diff --git a/.github/scripts/therock_configure_ci.py b/.github/scripts/therock_configure_ci.py
new file mode 100644
index 0000000000..557afe2d84
--- /dev/null
+++ b/.github/scripts/therock_configure_ci.py
@@ -0,0 +1,112 @@
+import fnmatch
+import json
+import os
+from pathlib import Path
+import subprocess
+import sys
+from typing import Iterable, Optional, Mapping
+
+def gha_set_output(vars: Mapping[str, str | Path]):
+    """Appends each key/value pair in `vars` to the step's output parameters.
+
+    Outputs go to the file named by the $GITHUB_OUTPUT environment variable.
+
+    See
+      * https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-output-parameter
+      * https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/passing-information-between-jobs
+    """
+    print(f"Setting github output:\n{vars}")
+
+    output_path = os.getenv("GITHUB_OUTPUT")
+    if not output_path:
+        print("  Warning: GITHUB_OUTPUT env var not set, can't set github outputs")
+        return
+    # Open in append mode so outputs written earlier in the step are kept.
+    with open(output_path, "a") as out:
+        out.writelines(f"{name}={value!s}\n" for name, value in vars.items())
+
+def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
+    """Returns paths from `git diff --name-only base_ref`, or None if git exceeds the 60s timeout."""
+    try:
+        return subprocess.run(
+            ["git", "diff", "--name-only", base_ref],
+            stdout=subprocess.PIPE,
+            check=True,  # A failing git raises CalledProcessError, which is not caught here.
+            text=True,
+            timeout=60,
+        ).stdout.splitlines()
+    except subprocess.TimeoutExpired:  # subprocess.run signals timeouts with this, not TimeoutError.
+        print(
+            "Computing modified files timed out. Not using PR diff to determine"
+            " jobs to run.",
+            file=sys.stderr,
+        )
+        return None
+
+# Changes to files matching one of these fnmatch globs are treated as unable
+# to affect build or test workflows; when every path modified by a commit/PR
+# matches some pattern here, the related jobs can be skipped.
+SKIPPABLE_PATH_PATTERNS = [
+    "docs/*",
+    "*.gitignore",
+    "*.md",
+    "*.pre-commit-config.*",
+    "*LICENSE",
+    "Jenkinsfile",
+    ".github/ISSUE_TEMPLATE/*",
+    ".github/CODEOWNERS",
+    ".github/*.md",
+    ".github/dependabot.yml",
+]
+
+def is_path_skippable(path: str) -> bool:
+    """Returns True when `path` matches at least one glob in SKIPPABLE_PATH_PATTERNS."""
+    return any(fnmatch.fnmatch(path, glob) for glob in SKIPPABLE_PATH_PATTERNS)
+
+def check_for_non_skippable_path(paths: Optional[Iterable[str]]) -> bool:
+    """Returns True when some entry of `paths` fails is_path_skippable()."""
+    if paths is None:  # No path list at all: report "nothing non-skippable".
+        return False
+    return not all(map(is_path_skippable, paths))
+
+def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
+    """Returns true if CI workflows should run given a list of modified paths.
+
+    Paths starting with `.github/workflows` are ignored; None or only-skippable paths give False.
+    """
+    if paths is None:  # NOTE(review): None likely means "diff unavailable", so this message may mislead.
+        print("No files were modified, skipping TheRock CI jobs")
+        return False
+
+    # Materialize once: `paths` may be a one-shot iterator, so derive everything from the set.
+    paths_set = set(paths)
+    github_workflows_paths = {p for p in paths_set if p.startswith(".github/workflows")}
+    other_paths = paths_set - github_workflows_paths
+
+    contains_other_non_skippable_files = check_for_non_skippable_path(other_paths)
+
+    print("should_ci_run_given_modified_paths findings:")
+    print(f"  contains_other_non_skippable_files: {contains_other_non_skippable_files}")
+
+    if contains_other_non_skippable_files:
+        print("Enabling TheRock CI jobs since a non-skippable path was modified")
+        return True
+    print(
+        "Only unrelated and/or skippable paths were modified, skipping TheRock CI jobs"
+    )
+    return False
+
+def main(args):
+    """Decides from the diff against args["base_ref"] whether CI runs; emits `enable_therock_ci`."""
+    base_ref = args.get("base_ref")
+    modified_paths = get_modified_paths(base_ref)
+    # get_modified_paths() may return None (timeout); slicing None would raise TypeError.
+    shown_paths = None if modified_paths is None else modified_paths[:200]
+    print("modified_paths (max 200):", shown_paths)
+    enable_jobs = should_ci_run_given_modified_paths(modified_paths)
+    gha_set_output({"enable_therock_ci": json.dumps(enable_jobs)})
+
+if __name__ == "__main__":
+    # BASE_REF names the git ref to diff against; without it, use HEAD's first parent.
+    base_ref = os.environ.get("BASE_REF", "HEAD^1")
+    main({"base_ref": base_ref})
diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml
new file mode 100644
index 0000000000..7db124d2a1
--- /dev/null
+++ b/.github/workflows/therock-ci-linux.yml
@@ -0,0 +1,130 @@
+name: TheRock CI Linux
+
+on:
+  workflow_call:
+    inputs:
+      cmake_options:
+        type: string
+      amdgpu_families:
+        type: string
+      test_runs_on:
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  therock-build-linux:
+    name: Build Linux Packages
+    runs-on: azure-linux-scale-rocm
+    permissions:
+      id-token: write
+    container:
+      image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:044b113562629f4bd2ec5d2e64b32eee11562d48fb1a75d7493daec9dd8d8292
+      options: -v /runner/config:/home/awsconfig/
+    env:
+      AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+      TEATIME_FORCE_INTERACTIVE: 0
+      AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini
+    steps:
+      - name: Checkout composable_kernel repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Checkout TheRock repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/TheRock"
+          ref: ec1c2ef4f2636bce7733fd8c95e1dbb6692c8a57
+          path: "TheRock"
+
+      - name: Runner Health Settings
+        run: |
+          df -h
+          cmake --version
+          echo "Installed Python versions:"
+          ls -d /opt/python
+          echo "python: $(which python), python3: $(which python3)"
+          echo "Git version: $(git --version)"
+          git config --global --add safe.directory $PWD
+          git config fetch.parallel 10
+      
+      - name: Fetch sources
+        run: |
+          ./TheRock/build_tools/fetch_sources.py --jobs 12
+
+      - name: Install python deps
+        run: |
+          pip install -r TheRock/requirements.txt
+          pip freeze
+
+      - name: Configure Projects
+        env:
+          amdgpu_families: ${{ env.AMDGPU_FAMILIES }}
+          package_version: ADHOCBUILD
+          extra_cmake_options: ${{ inputs.cmake_options }}
+          BUILD_DIR: build
+        run: |
+          python3 TheRock/build_tools/github_actions/build_configure.py
+
+      - name: Build TheRock
+        run: cmake --build TheRock/build
+
+      - name: Build therock-archives
+        run: cmake --build TheRock/build --target therock-archives
+
+      - name: Report
+        if: ${{ !cancelled() }}
+        run: |
+          echo "Full SDK du:"
+          echo "------------"
+          du -h -d 1 TheRock/build/dist/rocm
+          echo "Artifact Archives:"
+          echo "------------------"
+          ls -lh TheRock/build/artifacts/*.tar.xz
+          echo "Artifacts:"
+          echo "----------"
+          du -h -d 1 TheRock/build/artifacts
+
+      - name: Configure AWS Credentials for non-forked repos
+        if: ${{ always() && !github.event.pull_request.head.repo.fork }}
+        uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1
+        with:
+          aws-region: us-east-2
+          role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
+
+      - name: Create Logs index Files and upload logs
+        if: always()
+        run: |
+          python3 TheRock/build_tools/github_actions/create_log_index.py \
+            --build-dir=TheRock/build \
+            --amdgpu-family=${{ env.AMDGPU_FAMILIES }}
+
+          python3 TheRock/build_tools/github_actions/upload_build_logs_to_s3.py \
+            --build-dir=TheRock/build \
+            --run-id ${{ github.run_id }} \
+            --amdgpu-family ${{ env.AMDGPU_FAMILIES }}
+
+      - name: Upload artifacts
+        run: |
+          python TheRock/build_tools/github_actions/upload_build_artifacts.py \
+            --run-id ${{ github.run_id }} \
+            --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
+            --build-dir TheRock/build
+
+      - name: Add Links to Job Summary
+        if: always()
+        run: |
+          python TheRock/build_tools/github_actions/upload_build_summary.py \
+            --run-id ${{ github.run_id }} \
+            --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
+            --build-dir TheRock/build
+
+  therock-test-linux:
+    name: "Test"
+    needs: [therock-build-linux]
+    uses: ./.github/workflows/therock-test-packages.yml
+    with:
+      project_to_test: "miopen"
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      test_runs_on: ${{ inputs.test_runs_on }}
+      platform: "linux"
diff --git a/.github/workflows/therock-ci.yml b/.github/workflows/therock-ci.yml
new file mode 100644
index 0000000000..3232652b6b
--- /dev/null
+++ b/.github/workflows/therock-ci.yml
@@ -0,0 +1,81 @@
+name: TheRock CI for composable_kernel
+
+on:
+  push:
+    branches:
+      - develop
+  workflow_dispatch:
+  pull_request:
+    types:
+      - opened
+      - synchronize
+    branches:
+      - mainline
+      - release/*
+      - release-staging/*
+      - develop
+
+permissions:
+  contents: read
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  setup:
+    runs-on: ubuntu-24.04
+    env:
+      # The commit being checked out is the merge commit for a PR. Its first
+      # parent will be the tip of the base branch.
+      BASE_REF: HEAD^
+    outputs:
+      enable_therock_ci: ${{ steps.configure.outputs.enable_therock_ci }}
+    steps:
+      - name: "Checking out repository"
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          # We need the parent commit to do a diff
+          fetch-depth: 2
+
+      - name: "Configuring CI options"
+        id: configure
+        run: python .github/scripts/therock_configure_ci.py
+
+  therock-ci-linux:
+    name: TheRock CI Linux
+    needs: setup
+    if: ${{ needs.setup.outputs.enable_therock_ci == 'true' }}
+    permissions:
+      contents: read
+      id-token: write
+    uses: ./.github/workflows/therock-ci-linux.yml
+    secrets: inherit
+    with:
+      cmake_options: "-DTHEROCK_ENABLE_COMPOSABLE_KERNEL=ON -DTHEROCK_ENABLE_MIOPEN=ON -DTHEROCK_ENABLE_ALL=OFF -DTHEROCK_USE_EXTERNAL_CK=ON -DTHEROCK_CK_SOURCE_DIR=../"
+      amdgpu_families: "gfx94X-dcgpu"
+      test_runs_on: "linux-mi325-1gpu-ossci-rocm"
+
+  therock_ci_summary:
+    name: TheRock CI Summary
+    if: always()
+    needs:
+      - setup
+      - therock-ci-linux
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Output failed jobs
+        run: |
+          echo '${{ toJson(needs) }}'
+          FAILED_JOBS="$(echo '${{ toJson(needs) }}' \
+            | jq --raw-output \
+            'map_values(select(.result!="success" and .result!="skipped")) | keys | join(",")' \
+          )"
+          if [[ "${FAILED_JOBS}" != "" ]]; then
+            echo "The following jobs failed: ${FAILED_JOBS}"
+            exit 1
+          fi
diff --git a/.github/workflows/therock-test-packages.yml b/.github/workflows/therock-test-packages.yml
new file mode 100644
index 0000000000..37ddd399ad
--- /dev/null
+++ b/.github/workflows/therock-test-packages.yml
@@ -0,0 +1,77 @@
+name: TheRock Test Packages
+
+on:
+  workflow_call:
+    inputs:
+      project_to_test:
+        type: string
+      amdgpu_families:
+        type: string
+      test_runs_on:
+        type: string
+      platform:
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  configure_test_matrix:
+    name: "Configure test matrix"
+    runs-on: ubuntu-24.04
+    if: ${{ inputs.test_runs_on != '' }}
+    outputs:
+      components: ${{ steps.configure.outputs.components }}
+    steps:
+      - name: "Checking out repository"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/TheRock"
+
+      - name: "Configuring CI options"
+        env:
+          PLATFORM: ${{ inputs.platform }}
+          project_to_test: ${{ inputs.project_to_test }}
+        id: configure
+        run: python ./build_tools/github_actions/fetch_test_configurations.py
+
+  test_components:
+    name: 'Test ${{ matrix.components.job_name }}'
+    runs-on: ${{ inputs.test_runs_on }}
+    needs: configure_test_matrix
+    # skip tests if no test matrix to run
+    if: ${{ needs.configure_test_matrix.outputs.components != '[]' }}
+    strategy:
+      fail-fast: false
+      matrix:
+        components: ${{ fromJSON(needs.configure_test_matrix.outputs.components) }}
+    defaults:
+      run:
+        shell: bash
+    env:
+      VENV_DIR: ${{ github.workspace }}/.venv
+      ARTIFACT_RUN_ID: "${{ github.run_id }}"
+      OUTPUT_ARTIFACTS_DIR: ${{ github.workspace }}/build
+      THEROCK_BIN_DIR: "./build/bin"
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/TheRock"
+
+      - name: Run setup test environment workflow
+        uses: './.github/actions/setup_test_environment'
+        with:
+          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
+          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
+          VENV_DIR: ${{ env.VENV_DIR }}
+          FETCH_ARTIFACT_ARGS: ${{ matrix.components.fetch_artifact_args }}
+          PLATFORM: ${{ inputs.platform }}
+          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
+
+      - name: Test
+        timeout-minutes: ${{ matrix.components.timeout_minutes }} # per-component limit from the test matrix
+        run: |
+          if [ "${{ inputs.platform }}" == "linux" ]; then source "${VENV_DIR}/bin/activate" ; else . "${VENV_DIR}/Scripts/activate" ; fi
+          ${{ matrix.components.test_script }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9c942a776d..1246248eac 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 * Added rotating buffer feature for CK_Tile GEMM.
 * Added int8 support for CK_TILE GEMM.
 * Added support for elementwise kernel.
+* Added benchmarking support for tile engine GEMM Multi D.
 
 ### Optimized
 
@@ -47,6 +48,7 @@ None
 * Number of instances in instance factory for grouped convolution forward NGCHW/GKYXC/NGKHW has been reduced.
 * Number of instances in instance factory for grouped convolution backward weight NGCHW/GKYXC/NGKHW has been reduced.
 * Number of instances in instance factory for grouped convolution backward data NGCHW/GKYXC/NGKHW has been reduced.
+* Removed `BlockSize` in `make_kernel` and `CShuffleEpilogueProblem` to support Wave32 in CK_TILE (#2594).
 
 ### Known issues
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 19c036e1a5..35ebba8085 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -336,6 +336,11 @@ if(ENABLE_ASM_DUMP)
     message("CK compiled with ENABLE_ASM_DUMP set to ${ENABLE_ASM_DUMP}")
 endif()
 
+if(USE_OPT_GFX12 AND (SUPPORTED_GPU_TARGETS MATCHES "gfx12"))
+    add_compile_options(-mno-wavefrontsize64)
+    message(STATUS "CK compiled with USE_OPT_GFX12 set to ${USE_OPT_GFX12}")
+endif()
+
 ## Threads
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
diff --git a/Jenkinsfile b/Jenkinsfile
index 590ee92e90..b3b63098c2 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -401,7 +401,8 @@ def cmake_build(Map conf=[:]){
                     sh 'ninja -j64 package'
                     archiveArtifacts artifacts: 'composablekernel-dev*.deb'
                     sh 'mv composablekernel-dev_*.deb composablekernel-dev_all_targets_1.1.0_amd64.deb'
-                    stash includes: "composablekernel-dev_all_targets_1.1.0_amd64.deb", name: "packages"
+                    sh 'mv composablekernel-ckprofiler_*.deb composablekernel-ckprofiler_1.1.0_amd64.deb'
+                    stash includes: "composablekernel-**.deb", name: "packages"
                 }
             }
             else{
@@ -460,7 +461,9 @@ def buildHipClangJob(Map conf=[:]){
         }
         def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
         if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
-            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
+            // The --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 setting is required when code is built with the offload-compress flag by
+            // newer clang22 compilers and run with older HIP runtime libraries.
+            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 "
         }
         def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
         def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
@@ -518,7 +521,9 @@ def Build_CK(Map conf=[:]){
         }
         def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
         if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
-            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
+            // The --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 setting is required when code is built with the offload-compress flag by
+            // newer clang22 compilers and run with older HIP runtime libraries.
+            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 "
         }
         if(params.BUILD_LEGACY_OS){
             dockerOpts = dockerOpts + " --env LD_LIBRARY_PATH='/opt/Python-3.8.13/lib' "
@@ -567,19 +572,6 @@ def Build_CK(Map conf=[:]){
                                   python3 -m pytest python/test/test_gen_instances.py
                             """
                     }
-                    dir("build"){
-                        if (params.RUN_FULL_QA && arch == 2 ){
-                            // build deb packages
-                            echo "Build packages"
-                            sh 'ninja package'
-                            archiveArtifacts artifacts: 'composablekernel*.deb'
-                            sh 'mv composablekernel-ckprofiler_*.deb composablekernel-ckprofiler_1.1.0_amd64.deb'
-                            sh 'mv composablekernel-dev_*.deb composablekernel-dev_1.1.0_amd64.deb'
-                            sh 'mv composablekernel-examples_*.deb composablekernel-examples_1.1.0_amd64.deb'
-                            sh 'mv composablekernel-tests_*.deb composablekernel-tests_1.1.0_amd64.deb'
-                            stash includes: "composablekernel-**.deb", name: "packages"
-                        }
-                    }
                     // run performance tests, stash the logs, results will be processed on the master node
 					dir("script"){
                         if (params.RUN_PERFORMANCE_TESTS){
@@ -734,7 +726,7 @@ def process_results(Map conf=[:]){
                             echo "could not locate the FMHA performance logs: ${err.getMessage()}."
                         }
                     }
-                    if (params.RUN_FULL_QA || params.BUILD_INSTANCES_ONLY){
+                    if (params.BUILD_INSTANCES_ONLY){
                         // unstash deb packages
                         unstash "packages"
                         sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no composablekernel-*.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/"
@@ -888,6 +880,10 @@ pipeline {
             name: "RUN_GROUPED_CONV_LARGE_CASES_TESTS",
             defaultValue: false,
             description: "Run the grouped conv large cases tests (default: OFF)")
+        booleanParam(
+            name: "RUN_CONV_COMPREHENSIVE_DATASET",
+            defaultValue: false,
+            description: "Run comprehensive convolution dataset tests before important changes (default: OFF)")
         booleanParam(
             name: "RUN_CODEGEN_TESTS",
             defaultValue: true,
@@ -1086,6 +1082,33 @@ pipeline {
                 }
             }
         }
+        stage("Run Comprehensive Convolution Dataset Tests")
+        {
+            parallel
+            {
+                stage("Run Comprehensive Dataset Tests on gfx90a")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_CONV_COMPREHENSIVE_DATASET.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx90a")}
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ cd test_data && \
+                                           ./generate_test_dataset.sh && \
+                                           cd ../script && \
+                                           ../script/cmake-ck-dev.sh  ../ gfx90a && \
+                                           make -j64 test_grouped_convnd_fwd_dataset_xdl && \
+                                           ./bin/test_grouped_convnd_fwd_dataset_xdl"""
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
+            }
+        }
         stage("Run Codegen Tests")
         {
             parallel
@@ -1172,6 +1195,8 @@ pipeline {
                                             -D GPU_TARGETS="gfx90a" \
                                             -D GEMM_DATATYPE="fp8;fp16" \
                                             -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
+                                            -D GEMM_MULTI_D_DATATYPE="fp16" \
+                                            -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
                                             -DCMAKE_CXX_FLAGS=" -O3 " .. && \
                                            ninja -j64 benchmark_gemm_fp8_rcr && \
                                            ./bin/benchmark_gemm_fp8_rcr && \
@@ -1188,7 +1213,15 @@ pipeline {
                                            ninja -j64 benchmark_gemm_fp8_rrr && \
                                            ./bin/benchmark_gemm_fp8_rrr && \
                                            ninja -j64 benchmark_gemm_fp16_rrr && \
-                                           ./bin/benchmark_gemm_fp16_rrr """
+                                           ./bin/benchmark_gemm_fp16_rrr && \
+                                           ninja -j64 benchmark_gemm_multi_d_fp16_rrrr && \
+                                           ./bin/benchmark_gemm_multi_d_fp16_rrrr && \
+                                           ninja -j64 benchmark_gemm_multi_d_fp16_ccrr && \
+                                           ./bin/benchmark_gemm_multi_d_fp16_ccrr && \
+                                           ninja -j64 benchmark_gemm_multi_d_fp16_crrr && \
+                                           ./bin/benchmark_gemm_multi_d_fp16_crrr && \
+                                           ninja -j64 benchmark_gemm_multi_d_fp16_rcrr && \
+                                           ./bin/benchmark_gemm_multi_d_fp16_rcrr """
                     }
                     steps{
                         buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
@@ -1210,6 +1243,8 @@ pipeline {
                                             -D GPU_TARGETS="gfx942" \
                                             -D GEMM_DATATYPE="fp8;fp16" \
                                             -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
+                                            -D GEMM_MULTI_D_DATATYPE="fp16" \
+                                            -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
                                             -DCMAKE_CXX_FLAGS=" -O3 " .. && \
                                            ninja -j64 benchmark_gemm_fp8_rcr && \
                                            ./bin/benchmark_gemm_fp8_rcr && \
@@ -1226,7 +1261,15 @@ pipeline {
                                            ninja -j64 benchmark_gemm_fp8_rrr && \
                                            ./bin/benchmark_gemm_fp8_rrr && \
                                            ninja -j64 benchmark_gemm_fp16_rrr && \
-                                           ./bin/benchmark_gemm_fp16_rrr """
+                                           ./bin/benchmark_gemm_fp16_rrr && \
+                                           ninja -j64 benchmark_gemm_multi_d_fp16_rrrr && \
+                                           ./bin/benchmark_gemm_multi_d_fp16_rrrr && \
+                                           ninja -j64 benchmark_gemm_multi_d_fp16_ccrr && \
+                                           ./bin/benchmark_gemm_multi_d_fp16_ccrr && \
+                                           ninja -j64 benchmark_gemm_multi_d_fp16_crrr && \
+                                           ./bin/benchmark_gemm_multi_d_fp16_crrr && \
+                                           ninja -j64 benchmark_gemm_multi_d_fp16_rcrr && \
+                                           ./bin/benchmark_gemm_multi_d_fp16_rcrr """
                     }
                     steps{
                         buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
@@ -1385,7 +1428,7 @@ pipeline {
                                     -D CMAKE_BUILD_TYPE=Release \
                                     -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """
                             
-                            buildHipClangJobAndReboot(setup_cmd: "",  build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                            buildHipClangJobAndReboot(setup_cmd: "",  build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0")
                         }
                         cleanWs()
                     }
@@ -1419,7 +1462,7 @@ pipeline {
                     }
                     agent{ label rocmnode("gfx1101") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx11-generic" -DCMAKE_CXX_FLAGS=" -O3 " """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx11-generic" -DUSE_OPT_GFX11=ON -DCMAKE_CXX_FLAGS=" -O3 " """
                         execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                                            cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
                                            -DGPU_TARGETS="gfx11-generic" \
@@ -1440,7 +1483,7 @@ pipeline {
                     }
                     agent{ label rocmnode("gfx1201") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx12-generic" -DCMAKE_CXX_FLAGS=" -O3 " """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx12-generic" -DUSE_OPT_GFX12=ON -DCMAKE_CXX_FLAGS=" -O3 " """
                         execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                                            cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
                                            -DGPU_TARGETS="gfx12-generic" \
@@ -1462,7 +1505,7 @@ pipeline {
                 stage("Process results"){
                     when {
                         beforeAgent true
-                        expression { params.RUN_PERFORMANCE_TESTS.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+                        expression { (params.RUN_PERFORMANCE_TESTS.toBoolean() || params.BUILD_INSTANCES_ONLY.toBoolean()) && !params.BUILD_LEGACY_OS.toBoolean() }
                     }
                     agent { label 'mici' }
                     steps{
diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc
index 6c5d9f9fba..3e018aad1e 100644
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -1,7 +1,8 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
+#include "ck/library/utility/validation_common.hpp"
 
 template <typename ProblemType>
 bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
@@ -53,6 +54,17 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
     StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
 
+    try
+    {
+        ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
+            M, N, K, StrideA, StrideB, StrideC);
+    }
+    catch(const std::runtime_error& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return false;
+    }
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
 
diff --git a/example/01_gemm/run_gemm_example_v2.inc b/example/01_gemm/run_gemm_example_v2.inc
index 4adb6f896b..3d8cf32221 100644
--- a/example/01_gemm/run_gemm_example_v2.inc
+++ b/example/01_gemm/run_gemm_example_v2.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
index 18731e810e..03c531c1ad 100644
--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp
index 87812369bd..5167097b6d 100644
--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
index c3e6ef7d5d..abf7ef3905 100644
--- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp
index 93034a8b70..2582ea8a11 100644
--- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp
index e7c1d6f0be..57e2feb084 100644
--- a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp
+++ b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
diff --git a/example/ck_tile/01_fmha/README.md b/example/ck_tile/01_fmha/README.md
index 72109a660b..f72d7afa02 100644
--- a/example/ck_tile/01_fmha/README.md
+++ b/example/ck_tile/01_fmha/README.md
@@ -7,7 +7,7 @@ This folder contains example for fmha(fused multi-head attention) using ck_tile
 # in the root of ck_tile
 mkdir build && cd build
 # you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
-sh ../script/cmake-ck-dev.sh  ../ <arch>
+../script/cmake-ck-dev.sh  ../ <arch>
 make tile_example_fmha_fwd -j
 ```
 This will result in an executable `build/bin/tile_example_fmha_fwd`
diff --git a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
index 6fca800c90..42a9d5148a 100644
--- a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
+++ b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
@@ -115,6 +115,7 @@ PIPELINE_MAP = {
     "qr" : "ck_tile::BlockFmhaPipelineQRKSVS",
     "qr_async" : "ck_tile::BlockFmhaPipelineQRKSVSAsync",
     "qs" : "ck_tile::BlockFmhaPipelineQSKSVS",
+    "qr_async_trload" : "ck_tile::BlockFmhaPipelineQRKSVSAsyncTrload",
 }
 
 PIPELINE_ENUM_MAP = {
@@ -123,6 +124,7 @@ PIPELINE_ENUM_MAP = {
     "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
     "qs" : "ck_tile::BlockFmhaPipelineEnum::QSKSVS",
     "qr_pagedkv" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
+    "qr_async_trload" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC_TRLOAD",
 }
 
 BOOL_MAP = {
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py
index ffb6d579ed..0d8f366d8a 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py
@@ -84,6 +84,7 @@ using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaPipelineProblem<
     {F_mode},
     fmha_variant_{F_idx},
     fmha_mask_{F_idx},
+    false,
     fmha_trait_{F_idx}>;
 
 using fmha_pipeline_{F_idx} = {F_pipeline}<
@@ -98,7 +99,7 @@ using fmha_kernel_{F_idx} =
     ck_tile::FmhaBatchPrefillWithPagedKVCacheKernel<fmha_pipeline_{F_idx}, fmha_epilogue_{F_idx}>;
 
 using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
-                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, false>;
 
 #include <iostream>
 
@@ -109,9 +110,9 @@ float fmha_batch_prefill_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_b
     if(s.log_level_ > 0)
         std::cout << ", " << k_::GetName() << std::flush;
     auto [kargs, grids] = fmha_batch_prefill_create_kargs_and_grids<k_>(a);
-    constexpr dim3 blocks             = k_::BlockSize();
+    const dim3 blocks                      = k_::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
-    return ck_tile::launch_kernel(s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
+    return ck_tile::launch_kernel(s, ck_tile::make_kernel<kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
 }}
 """
 
@@ -177,7 +178,7 @@ FMHA_FWD_API_PER_HDIM_CASE="""        {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <
 
 FMHA_FWD_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.has_dropout == {F_dropout}) && (t.do_fp8_static_quant == {F_squant}) &&
                         ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{
-                using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+                using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, false>;
                 return fmha_batch_prefill_<trait_>(s, a);
             }}
 """
@@ -507,8 +508,8 @@ class KernelComponentFactory:
             for logits, mask, bias, lse, dropout in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]):
                     pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask))
                     pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask))
-                    pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask))
-                    pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask))
+                    # pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask))
+                    # pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask))
         else:
             assert False
         return pipelines
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
index bb3a0587e7..0391191fb2 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
@@ -136,10 +136,10 @@ float fmha_bwd_dq_dk_dv_<dq_dk_dv_trait_{F_idx}>(const ck_tile::stream_config& s
     if(s.log_level_ > 0)
         std::cout << ", " << k_::GetName() << std::flush;
     auto [kargs, grids]                    = fmha_bwd_dq_dk_dv_create_kargs_and_grids<k_>(a);
-    constexpr dim3 blocks                  = k_::BlockSize();
+    const dim3 blocks                      = k_::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
 }}
 
 template <>
@@ -148,9 +148,9 @@ void fmha_bwd_dq_dk_dv_oneshot_<dq_dk_dv_trait_{F_idx}>(const ck_tile::stream_co
 {{
     using k_                               = fmha_bwd_dq_dk_dv_kernel_{F_idx};
     auto [kargs, grids]                    = fmha_bwd_dq_dk_dv_create_kargs_and_grids<k_>(a);
-    constexpr dim3 blocks                  = k_::BlockSize();
+    const dim3 blocks                      = k_::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
-    ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs)(
+    ck_tile::make_kernel<kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs)(
         ck_tile::stream_config{{s.stream_id_}});
 }}
 
@@ -425,10 +425,10 @@ float fmha_bwd_dot_do_o_<dot_do_o_trait_{F_idx}>(const ck_tile::stream_config& s
     if(s.log_level_ > 0)
         std::cout << ", " << k_::GetName() << std::flush;
     auto [kargs, grids]                    = fmha_bwd_dot_do_o_create_kargs_and_grids<k_>(a);
-    constexpr dim3 blocks                  = k_::BlockSize();
+    const dim3 blocks                      = k_::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
 }}
 
 template <>
@@ -436,9 +436,9 @@ void fmha_bwd_dot_do_o_oneshot_<dot_do_o_trait_{F_idx}>(const ck_tile::stream_co
 {{
     using k_                               = fmha_bwd_dot_do_o_kernel_{F_idx};
     auto [kargs, grids]                    = fmha_bwd_dot_do_o_create_kargs_and_grids<k_>(a);
-    constexpr dim3 blocks                  = k_::BlockSize();
+    const dim3 blocks                      = k_::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
-    ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs)(
+    ck_tile::make_kernel<kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs)(
         ck_tile::stream_config{{s.stream_id_}});
 }}
 
@@ -530,10 +530,10 @@ float fmha_bwd_convert_dq_<convert_dq_trait_{F_idx}>(const ck_tile::stream_confi
     if(s.log_level_ > 0)
         std::cout << ", " << k_::GetName() << std::flush;
     auto [kargs, grids]                    = fmha_bwd_convert_dq_create_kargs_and_grids<k_>(a);
-    constexpr dim3 blocks                  = k_::BlockSize();
+    const dim3 blocks                      = k_::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
 }}
 
 template <>
@@ -542,9 +542,9 @@ void fmha_bwd_convert_dq_oneshot_<convert_dq_trait_{F_idx}>(const ck_tile::strea
 {{
     using k_                               = fmha_bwd_convert_dq_kernel_{F_idx};
     auto [kargs, grids]                    = fmha_bwd_convert_dq_create_kargs_and_grids<k_>(a);
-    constexpr dim3 blocks                  = k_::BlockSize();
+    const dim3 blocks                      = k_::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
-    ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs)(
+    ck_tile::make_kernel<kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs)(
         ck_tile::stream_config{{s.stream_id_}});
 }}
 
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
index 95202a5f72..78668729f4 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -12,6 +12,7 @@ from typing import List, Optional, Tuple
 
 from codegen.cmake_config import *
 from codegen.cpp_symbol_map import *
+from codegen.utils import update_file
 
 
 DTYPE_BITS = {
@@ -83,6 +84,7 @@ using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaPipelineProblem<
     {F_mode},
     fmha_variant_{F_idx},
     fmha_mask_{F_idx},
+    {F_trload},
     fmha_trait_{F_idx}>;
 
 using fmha_pipeline_{F_idx} = {F_pipeline}<
@@ -97,7 +99,7 @@ using fmha_kernel_{F_idx} =
     ck_tile::FmhaFwdKernel<fmha_pipeline_{F_idx}, fmha_epilogue_{F_idx}>;
 
 using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
-                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_skip}>;
+                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}, {F_skip}>;
 
 #include <iostream>
 
@@ -108,9 +110,9 @@ float fmha_fwd_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd_args a)
     if(s.log_level_ > 0)
         std::cout << ", " << k_::GetName() << std::flush;
     auto [kargs, grids] = fmha_fwd_create_kargs_and_grids<k_>(a);
-    constexpr dim3 blocks             = k_::BlockSize();
+    const dim3 blocks                      = k_::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
-    return ck_tile::launch_kernel(s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
+    return ck_tile::launch_kernel(s, ck_tile::make_kernel<kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
 }}
 """
 
@@ -161,12 +163,19 @@ float fmha_fwd(fmha_fwd_traits t, fmha_fwd_args a, const ck_tile::stream_config&
     [[maybe_unused]] auto get_num_blocks = [&](unsigned kM0) {{
         return get_num_thread_blocks(a.batch, a.nhead_q, a.max_seqlen_q, kM0);
     }};
+    
+    const bool has_load_tr = ck_tile::is_load_tr_supported();
 
 {F_dispatch}
     return r;
 }}
 """
 
+FMHA_FWD_API_PER_TRLOAD="""    {F_if}({F_trload_cond}){{
+{F_dtype_case}
+    }}
+"""
+
 FMHA_FWD_API_PER_DTYPE="""    {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{
 {F_hdim_case}
     }}
@@ -177,8 +186,8 @@ FMHA_FWD_API_PER_HDIM_CASE="""        {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <
 """
 
 FMHA_FWD_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.has_dropout == {F_dropout}) && (t.do_fp8_static_quant == {F_squant}) && (t.skip_min_seqlen_q == {F_skip}) &&
-                        ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{
-                using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_skip}>;
+                        ({F_scheck}) && ({F_seqtune}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{
+                using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}, {F_skip}>;
                 return fmha_fwd_<trait_>(s, a);
             }}
 """
@@ -221,6 +230,7 @@ class FmhaFwdApiTrait:
     dpad       : str
     dvpad      : str
     skip       : str
+    tr_load    : str
     constraint : CppConstraint
 
     @property
@@ -231,13 +241,19 @@ class FmhaFwdApiTrait:
     @property
     def scheck(self) -> str:
         if self.mode == 'group': return 'true/*group mode spad always true*/'                  # group mode only generate spad/skpad == true
-        if self.pipeline_tag == 'qr_async':
+        if self.pipeline_tag in ['qr_async', 'qr_async_trload']:
             if self.spad == 't' : return 'true' # always support
             else :                return 'true'
         elif self.pipeline_tag in ['qr', 'qs']:
             if self.spad == 't' : return f'true /*a.seqlen_q % {self.bm0} != 0*/'  # TODO: order of get_pipelines() matters! (ugly)
             else :                return f'a.seqlen_q % {self.bm0} == 0'
         else: assert False
+    
+    @property
+    def seqtune(self) -> str:
+        if self.bm0 == 128: return 'true/*fall back to largest tile*/'                  # bm0 == 128 emits an always-true check; any other bm0 emits 'a.seqlen_q <= bm0'
+        else: 
+            return f'a.seqlen_q <= {self.bm0}'
 
     @property
     def skcheck(self) -> str:
@@ -248,6 +264,9 @@ class FmhaFwdApiTrait:
         elif self.pipeline_tag in ['qr', 'qs']:
             if self.skpad == 't' : return f'true /*a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
             else :                return f'a.seqlen_k % {self.bn0} == 0'
+        elif self.pipeline_tag == 'qr_async_trload':
+            if self.skpad == 't' : return 'true'
+            else:                  return 'true'
         else: assert False
 
     @property
@@ -256,7 +275,7 @@ class FmhaFwdApiTrait:
             vec = int((32 * 4) / DTYPE_BITS[self.dtype])
             if self.dpad == 't': return f'a.hdim_q % {vec} == 0'
             else :               assert False
-        elif self.pipeline_tag in ['qr', 'qs']:
+        elif self.pipeline_tag in ['qr', 'qs', 'qr_async_trload']:
             bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
             if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
             else :               return f'a.hdim_q % {bk0submax} == 0'
@@ -268,7 +287,7 @@ class FmhaFwdApiTrait:
             vec = int((32 * 4) / DTYPE_BITS[self.dtype])
             if self.dvpad == 't': return f'a.hdim_v % {vec} == 0'
             else :                assert False
-        elif self.pipeline_tag in ['qr', 'qs']:
+        elif self.pipeline_tag in ['qr', 'qs', 'qr_async_trload']:
             bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
             if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
             else :                return f'a.hdim_v % {bk0submax} == 0'
@@ -290,6 +309,7 @@ class FmhaFwdPipeline:
     F_squant     : str  #
     F_mask       : str  # value from MASK_MAP
     F_skip       : str  # true/false
+    F_trload     : str  # true/false
     F_constraint : CppConstraint = field(default_factory=lambda: CppConstraint())
 
     @property
@@ -331,6 +351,9 @@ class FmhaFwdPipeline:
 
         if self.F_squant == 't' : n += '_squant'
         else: n += '_nsquant'
+        
+        if self.F_trload == 't' : n += '_trload'
+        else: n += '_ntrload'
 
         return n
 
@@ -351,31 +374,39 @@ class FmhaFwdApiPool:
 
     @property
     def api(self) -> str:
-        per_dtypes=str()
-        for i, dtype in enumerate(self.pool.keys()):
-            per_hdim_case=str()
-            for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()):
-                traits=self.pool[dtype][(hdim, hdim_v)]
-                inners=str()
-                for k, trait in enumerate(traits):
-                    if_k = 'if' if k == 0 else 'else if'
-                    inners = inners + FMHA_FWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout],
-                                   F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_logits=BOOL_MAP[trait.logits], F_mask=get_mask_map(self.mask_impl)[trait.mask],
-                                   F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias],
-                                   F_lse=BOOL_MAP[trait.lse], F_dropout=BOOL_MAP[trait.dropout], F_skip=BOOL_MAP[trait.skip],
-                                   F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
-                                   F_constraint=trait.constraint,
-                                   F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
-                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
-                                   F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
-                if_j = 'if' if j == 0 else 'else if'
-                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_hdim_v=hdim_v, F_inner_dispatch=inners)
-            if_i = 'if' if i == 0 else 'else if'
-            per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
-        if not per_dtypes:
+        tr_load_cond_map = {
+            "t": "has_load_tr",
+            "f": "true"
+        }
+        
+        per_tr_load =str()
+        for tr_load in ["t", "f"]:
+            per_dtypes=str()
+            for i, dtype in enumerate(self.pool.keys()):
+                per_hdim_case=str()
+                for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()):
+                    traits=[t for t in self.pool[dtype][(hdim, hdim_v)] if tr_load == t.tr_load]
+                    inners=str()
+                    for k, trait in enumerate(traits):
+                        if_k = 'if' if k == 0 else 'else if'
+                        inners = inners + FMHA_FWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout],
+                                       F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_logits=BOOL_MAP[trait.logits], F_mask=get_mask_map(self.mask_impl)[trait.mask],
+                                       F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias],
+                                       F_lse=BOOL_MAP[trait.lse], F_dropout=BOOL_MAP[trait.dropout], F_skip=BOOL_MAP[trait.skip], F_trload=BOOL_MAP[trait.tr_load],
+                                       F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_seqtune=trait.seqtune, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
+                                       F_constraint=trait.constraint,
+                                       F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
+                                       F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
+                                       F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
+                    if_j = 'if' if j == 0 else 'else if'
+                    per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_hdim_v=hdim_v, F_inner_dispatch=inners)
+                if_i = 'if' if i == 0 else 'else if'
+                per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
+            per_tr_load += FMHA_FWD_API_PER_TRLOAD.format(F_if='if', F_trload_cond=tr_load_cond_map[tr_load], F_dtype_case=per_dtypes)
+        if not per_tr_load:
             # empty string we add some ignore to suppress warning in api
-            per_dtypes += '    (void)t ; (void)s ; (void)a;'
-        return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch = per_dtypes)
+            per_tr_load += '    (void)t ; (void)s ; (void)a;'
+        return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch = per_tr_load)
 
 @dataclass
 class FmhaFwdTileSize:
@@ -458,7 +489,8 @@ class FmhaFwdKernel:
                 F_pipeline_enum = PIPELINE_ENUM_MAP[self.F_pipeline.tag],
                 F_mask          = get_mask_map(self.mask_impl)[self.F_pipeline.F_mask],
                 F_mode          = MODE_MAP[self.F_mode],
-                F_pipeline      = PIPELINE_MAP[self.F_pipeline.tag])
+                F_pipeline      = PIPELINE_MAP[self.F_pipeline.tag],
+                F_trload        = BOOL_MAP[self.F_pipeline.F_trload])
 
     @property
     def name(self) -> str:
@@ -494,6 +526,7 @@ class FmhaFwdKernel:
                 dpad=self.F_pipeline.F_dpad,
                 dvpad=self.F_pipeline.F_dvpad,
                 skip=self.F_pipeline.F_skip,
+                tr_load=self.F_pipeline.F_trload,
                 constraint=self.F_tile.F_constraint & self.F_pipeline.F_constraint)
 
 class KernelComponentFactory:
@@ -503,11 +536,16 @@ class KernelComponentFactory:
     def get_hdim_tile_size_dict(dtype : str) -> Optional[dict]:
         if dtype == 'fp16' or dtype == 'bf16':
             return {
-                (32, 32)  : [FmhaFwdTileSize(128, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
-                (64, 64)  : [FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+                (32, 32)  : [FmhaFwdTileSize(128, 64,  16, 32,  32,  32,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+                (64, 64)  : [FmhaFwdTileSize(16, 32,  64, 64,  32,  64,   1, 1, 1,  1, 1, 1,  16, 16, 32,  16, 16, 32,  -1),
+                             FmhaFwdTileSize(32, 32,  64, 64,  32,  64,   1, 1, 1,  1, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+                             FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
                 (96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
-                (128,128) : [FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
-                (160,160) : [FmhaFwdTileSize(128, 128, 32, 160, 32,  160,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,   1)],
+                (128,128) : [FmhaFwdTileSize(16, 32, 64, 128, 32,  128,  1, 1, 1,  1, 1, 1,  16, 16, 32,  16, 16, 32,  -1),
+                             FmhaFwdTileSize(32, 32, 128, 128, 32,  128,  1, 1, 1,  1, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+                             FmhaFwdTileSize(128, 64, 32, 128, 16,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+                             FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+                # (160,160) : [FmhaFwdTileSize(128, 128, 32, 160, 32,  160,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,   1)],
                 (192,128) : [FmhaFwdTileSize(128, 128, 32, 128, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
                 (192,192) : [FmhaFwdTileSize(128, 128, 32, 192, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,   1)],
                 (256,256) : [FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
@@ -534,37 +572,30 @@ class KernelComponentFactory:
         if dtype in ['fp16', 'bf16']:
             for logits, mask, bias, lse, dropout, skip in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"], ["t", "f"]):
                 if hdim == 256 and hdim_v == 256:
-                # if True:
-                    pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
-                    # pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
+                    pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, 'f'))
                     # the below two is used for hdim vectorize load
-                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
-                    # pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
-
-                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
-                    # pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
+                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, 'f'))
+                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 'f'))
                 else:
                     if bias == "bias":
                         # TODO: rocm 6.2 compiler problem if using qr_async for bias case
-                        pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
-                        pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
-                        # pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
-                        # pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
+                        pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, 'f'))
+                        pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 'f'))
                     else:
-                        pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
-                        pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
-                        # pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
-                        # pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
+                        pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 'f'))
+                        pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 'f'))
+                        if (hdim, hdim_v) in [(64, 64), (128, 128)] and logits == "f" and bias == "no" and dropout == "f" and lse == "f" and skip == "f":
+                            pipelines.append(FmhaFwdPipeline('qr_async_trload', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, 't'))
+                            pipelines.append(FmhaFwdPipeline('qr_async_trload', 'row', 'f', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 't'))
                     if receipt == 1 and bias != "bias":
-                        pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim
-                        # pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim
+                        pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 'f')) # TODO: cover arbitraty hdim
         elif dtype in ['fp8', 'bf8']:
             # no need lse/dropout kernels
             for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
-                pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, 'f', 'f', squant, mask, 'f'))
-                pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 'f', 'f', squant, mask, 'f'))
-                pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, 'f', 'f', squant, mask, 'f'))
-                pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, 'f', 'f', squant, mask, 'f'))
+                pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, 'f', 'f', squant, mask, 'f', 'f'))
+                pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 'f', 'f', squant, mask, 'f', 'f'))
+                pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, 'f', 'f', 'f', mask, 'f', 'f'))
+                pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 'f', 'f', 'f', mask, 'f', 'f'))
         elif dtype in ['fp8fp16', 'fp8bf16']:
             # TODO
             None
@@ -602,6 +633,12 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl
                     # NOTE: this is used to speedup deepseek prefill case, we don't gen training
                     if pipeline.F_bias != 'no' or pipeline.F_dropout == 't':
                         continue
+                if pipeline.tag != 'qr_async_trload' and (((hdim, hdim_v) == (128, 128) and tile.F_bn0 != 128) or ((hdim, hdim_v) != (128, 128) and tile.F_bm0 != 128)):
+                    # pipelines other than qr_async_trload only support the bm0=128 tile size when hdim is not 128
+                    # pipelines other than qr_async_trload only support the bn0=128 tile size when hdim is 128
+                    continue
+                if pipeline.tag == 'qr_async_trload' and (((hdim, hdim_v) == (128, 128) and tile.F_bn0 == 128) or ((hdim, hdim_v) not in [(64, 64), (128, 128)])):
+                    continue
                 # logits_soft_cap is only allowed if no bias
                 if not ((pipeline.F_logits == 't' and pipeline.F_bias == 'no') or pipeline.F_logits == 'f'):
                     continue
@@ -668,10 +705,10 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl
     return (api_pool, gen)
 
 def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None:
-    (autogen_dir / kernel.filename).write_text(kernel.template)
+    update_file(autogen_dir / kernel.filename, kernel.template)
 
 def write_fwd_api(api_pool : FmhaFwdApiPool, autogen_dir: Path) -> None:
-    (autogen_dir / FMHA_FWD_API_FILENAME).write_text(api_pool.api)
+    update_file(autogen_dir / FMHA_FWD_API_FILENAME, api_pool.api)
 
 def write_blobs(output_dir : Path, kernel_filter : str, receipt, optdim_list, mask_impl) -> None:
     api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl)
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
index 2e5bc2bd3d..0ebeaddf9c 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
@@ -60,9 +60,9 @@ float fmha_fwd_appendkv_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fw
     if(s.log_level_ > 0)
         std::cout << ", " << k_::GetName() << std::flush;
     auto [kargs, grids] = fmha_fwd_appendkv_create_kargs_and_grids<k_>(a);
-    constexpr dim3 blocks             = k_::BlockSize();
+    const dim3 blocks                      = k_::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
-    return ck_tile::launch_kernel(s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
+    return ck_tile::launch_kernel(s, ck_tile::make_kernel<kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
 }}
 """
 
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index 0e4ac44d45..1dd8f0e3c6 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -108,9 +108,9 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
     using k_ = fmha_kernel;
     auto [kargs, grids] = fmha_fwd_splitkv_create_kargs_and_grids<k_>(a);
-    constexpr dim3 blocks             = k_::BlockSize();
+    const dim3 blocks                      = k_::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
-    ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs)(ck_tile::stream_config{{s.stream_id_}});
+    ck_tile::make_kernel<kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs)(ck_tile::stream_config{{s.stream_id_}});
 }}
 }};
 }}
@@ -208,9 +208,9 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 {{
     using k_ = fmha_kernel;
     auto [kargs, grids] = fmha_fwd_splitkv_combine_create_kargs_and_grids<k_>(a);
-    constexpr dim3 blocks             = k_::BlockSize();
+    const dim3 blocks                      = k_::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
-    ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs)(ck_tile::stream_config{{s.stream_id_}});
+    ck_tile::make_kernel<kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs)(ck_tile::stream_config{{s.stream_id_}});
 }}
 }};
 }}
@@ -638,7 +638,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
             '64'  : FmhaFwdTileSize(64, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
             '96'  : FmhaFwdTileSize(64, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
             '128' : FmhaFwdTileSize(64, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
-            '160' : FmhaFwdTileSize(64, 128, 32, 160, 32,  160,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+            # '160' : FmhaFwdTileSize(64, 128, 32, 160, 32,  160,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
             '256' : FmhaFwdTileSize(64, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
         }
     elif dtype == 'fp8' or dtype == 'bf8':
@@ -657,7 +657,7 @@ def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[d
             '64'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
             '96'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
             '128' : FmhaFwdSplitKVCombineTileSize(32,  -1),
-            '160' : FmhaFwdSplitKVCombineTileSize(32,  -1),
+            # '160' : FmhaFwdSplitKVCombineTileSize(32,  -1),
             '256' : FmhaFwdSplitKVCombineTileSize(32,  -1),
     }
     elif dtype == 'fp8' or dtype == 'bf8':
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py b/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py
index a98d1d4423..43a69cca6c 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py
@@ -109,9 +109,9 @@ float fmha_fwd_pagedkv_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd
     if(s.log_level_ > 0)
         std::cout << ", " << k_::GetName() << std::flush;
     auto [kargs, grids] = fmha_fwd_pagedkv_create_kargs_and_grids<k_>(a);
-    constexpr dim3 blocks             = k_::BlockSize();
+    const dim3 blocks                      = k_::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
-    return ck_tile::launch_kernel(s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
+    return ck_tile::launch_kernel(s, ck_tile::make_kernel<kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
 }}
 """
 
diff --git a/example/ck_tile/01_fmha/fmha_bwd.cpp b/example/ck_tile/01_fmha/fmha_bwd.cpp
index 9c2907778f..9f1e0f6948 100644
--- a/example/ck_tile/01_fmha/fmha_bwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.cpp
@@ -809,20 +809,6 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     ck_tile::stream_config stream_config_v{
         nullptr, true, 0, 0, 1, arg_parser.get_str("timer") == std::string("gpu")};
-
-    printf("\nfmha_bwd_traits: hdim_q=%d, hdim_v=%d, data_type=%s, is_group_mode=%d, mask_type=%d, "
-           "bias_type=%d, has_dbias=%d, has_dropout=%d, is_store_randval=%d, is_deterministic=%d\n",
-           fmha_traits.hdim_q,
-           fmha_traits.hdim_v,
-           fmha_traits.data_type.c_str(),
-           fmha_traits.is_group_mode,
-           static_cast<int>(fmha_traits.mask_type),
-           static_cast<int>(fmha_traits.bias_type),
-           fmha_traits.has_dbias,
-           fmha_traits.has_dropout,
-           fmha_traits.is_store_randval,
-           fmha_traits.is_deterministic);
-    fflush(stdout);
     fmha_bwd(fmha_traits, fmha_args, stream_config_v);
 
     dq_buf.FromDevice(dq_host.data());
diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp
index ee599c973b..777ae59db3 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -1144,7 +1144,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     std::cout << std::fixed << ", " << std::setprecision(3) << ave_time << " ms, "
               << std::setprecision(2) << tflops << " TFlops, " << std::setprecision(2) << gb_per_sec
-              << " GB/s" << std::flush;
+              << " GB/s" << std::flush << std::endl;
 
     if(do_validation == 0)
     {
diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp
index bd5e110214..8c712b0aa7 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
+#include "ck_tile/host/device_prop.hpp"
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/fmha.hpp"
@@ -1028,6 +1029,7 @@ template <ck_tile::index_t HDim_,
           bool kPadSK_,
           bool kPadD_,
           bool kPadDv_,
+          bool kUseTrLoad_,
           bool kSkipMinSeqlenQ_ = false>
 struct fmha_fwd_traits_
 {
@@ -1052,6 +1054,7 @@ struct fmha_fwd_traits_
     static constexpr bool kPadSK                     = kPadSK_;
     static constexpr bool kPadD                      = kPadD_;
     static constexpr bool kPadDv                     = kPadDv_;
+    static constexpr bool kUseTrLoad                 = kUseTrLoad_;
     static constexpr bool kSkipMinSeqlenQ            = kSkipMinSeqlenQ_;
 };
 
diff --git a/example/ck_tile/01_fmha/script/benchmark_fwd.sh b/example/ck_tile/01_fmha/script/benchmark_fwd.sh
index 599c595a75..88c16cceb6 100755
--- a/example/ck_tile/01_fmha/script/benchmark_fwd.sh
+++ b/example/ck_tile/01_fmha/script/benchmark_fwd.sh
@@ -18,14 +18,3 @@ $EXE -prec=$prec -b=1  -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kn
 done
 done
 done
-
-for perm in 0 1 ; do
-
-$EXE -prec=fp8 -squant=1 -b=32 -h=16 -d=128 -s=512   -iperm=$perm -operm=$perm -vlayout=c -range_q=240 -range_k=240 -range_v=240 -range_p=240 -range_o=240 -kname=1 -v=$VALID ; sleep 3
-$EXE -prec=fp8 -squant=1 -b=16 -h=16 -d=128 -s=1024  -iperm=$perm -operm=$perm -vlayout=c -range_q=240 -range_k=240 -range_v=240 -range_p=240 -range_o=240 -kname=1 -v=$VALID ; sleep 3
-$EXE -prec=fp8 -squant=1 -b=8  -h=16 -d=128 -s=2048  -iperm=$perm -operm=$perm -vlayout=c -range_q=240 -range_k=240 -range_v=240 -range_p=240 -range_o=240 -kname=1 -v=$VALID ; sleep 3
-$EXE -prec=fp8 -squant=1 -b=4  -h=16 -d=128 -s=4096  -iperm=$perm -operm=$perm -vlayout=c -range_q=240 -range_k=240 -range_v=240 -range_p=240 -range_o=240 -kname=1 -v=$VALID ; sleep 3
-$EXE -prec=fp8 -squant=1 -b=2  -h=16 -d=128 -s=8192  -iperm=$perm -operm=$perm -vlayout=c -range_q=240 -range_k=240 -range_v=240 -range_p=240 -range_o=240 -kname=1 -v=$VALID ; sleep 3
-$EXE -prec=fp8 -squant=1 -b=1  -h=16 -d=128 -s=16384 -iperm=$perm -operm=$perm -vlayout=c -range_q=240 -range_k=240 -range_v=240 -range_p=240 -range_o=240 -kname=1 -v=$VALID ; sleep 3
-
-done
\ No newline at end of file
diff --git a/example/ck_tile/01_fmha/script/run_full_test.sh b/example/ck_tile/01_fmha/script/run_full_test.sh
index b5e6778aa5..e7babd2744 100755
--- a/example/ck_tile/01_fmha/script/run_full_test.sh
+++ b/example/ck_tile/01_fmha/script/run_full_test.sh
@@ -9,6 +9,8 @@
 # host name        : $hostname
 # gpu architecture: e.g., gfx90a, or gfx942, etc.
 
+set -euo pipefail
+
 #get the command line arguments:
 export env_type=$1
 echo 'Environment type: ' $env_type
diff --git a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh
index 5ba3425e26..d123f842a2 100755
--- a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh
+++ b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh
@@ -1,5 +1,7 @@
-#!/bin/sh
+#!/bin/bash
 # TODO: run this script from CK root or build directory
+set -euo pipefail
+
 EXE="$(find . -name tile_example_fmha_bwd -type f | head -n 1)"
 KNAME=1
 
@@ -17,12 +19,12 @@ for dbias in 0 ; do
 for p_drop in 0.0 0.2 ; do
 for deterministic in 0 ; do
 
-$EXE -prec=$prec -b=1 -h=4 -h_k=2 -d=$hdim -s=259 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=2 -h=2 -d=$hdim -s=516 -s_k=253 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=1 -h=4 -h_k=1 -d=$hdim -s=500 -s_k=251 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=1 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=1 -h=2 -d=$hdim -s=900 -s_k=258 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=2 -v=1 -deterministic=$deterministic -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=2 -h=1 -d=$hdim -s=987 -s_k=219 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=t:128,30 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
-$EXE -prec=$prec -b=2 -h=3 -h_k=1 -d=$hdim -s=244 -s_k=499 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=b:4,35 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=1 -h=4 -h_k=2 -d=$hdim -s=259          -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm                -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=2 -h=2        -d=$hdim -s=516 -s_k=253 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm                -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=1 -h=4 -h_k=1 -d=$hdim -s=500 -s_k=251 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=1        -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=1 -h=2        -d=$hdim -s=900 -s_k=258 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=2        -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=2 -h=1        -d=$hdim -s=987 -s_k=219 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=t:128,30 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+$EXE -prec=$prec -b=2 -h=3 -h_k=1 -d=$hdim -s=244 -s_k=499 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=b:4,35   -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
 
 done
 done
diff --git a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
index b867cd6c07..3913a0d5c2 100755
--- a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
+++ b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 # TODO: run this script from CK root or build directory
+set -euo pipefail
+
 EXE="$(find . -name tile_example_fmha_fwd -type f | head -n 1)"
 KNAME=1
 
@@ -42,7 +44,6 @@ run_fp16_bf16_tests() {
     for prec in "fp16" "bf16" ; do
     for mode in 1 0 ; do
     for perm in 0 1 ; do
-    for vlayout in "r" "c" ; do
     for hdim in 32 64 128 256 ; do
     for lse in 0 1 ; do
     for bias in "n" "e" "a" ; do
@@ -51,20 +52,19 @@ run_fp16_bf16_tests() {
     for page_block_size in $PAGE_BLOCK_SIZE ; do
     for cache_batch_idx in $CACHE_BATCH_IDX ; do
 
-    # $EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=2 -h=2 -h_k=1 -d=16, -d_v=$hdim -s=55 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=1 -h=3 -d=$hdim -s=100 -s_k=51 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=16 -d_v=$hdim -s=99 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=1 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=1024 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=$hdim -d_v=24 -s=3 -s_k=99 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=3 -h=2 -h_k=1 -d=$hdim -s=200 -s_k=520 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=t:128,30 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=$hdim -s=99 -s_k=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=b:4,35 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=33 -s_k=0 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=1 -s_k=10 -s_kpad=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    # $EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm  -num_splits=$num_splits -page_block_size=$page_block_size -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=2 -h=2 -h_k=1 -d=16    -d_v=$hdim -s=55   -s_k=256            -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm                -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=1 -h=3        -d=$hdim            -s=100  -s_k=51             -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm                -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=2 -h=1        -d=16    -d_v=$hdim -s=99   -s_k=256            -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=1        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim            -s=1024 -s_k=256            -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=2 -h=1        -d=$hdim -d_v=24    -s=3    -s_k=99             -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=3 -h=2 -h_k=1 -d=$hdim            -s=200  -s_k=520            -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=t:128,30 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=2 -h=1        -d=$hdim            -s=99   -s_k=32             -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=b:4,35   -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim            -s=33   -s_k=0              -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim            -s=1    -s_k=10  -s_kpad=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2        -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
 
     done ; done ; done ; done ; done
     done ; done ; done ; done ; done
-    done ;
 }
 
 run_fp8_tests() {
diff --git a/example/ck_tile/02_layernorm2d/README.md b/example/ck_tile/02_layernorm2d/README.md
index 817f62dae7..da74e2e3c1 100644
--- a/example/ck_tile/02_layernorm2d/README.md
+++ b/example/ck_tile/02_layernorm2d/README.md
@@ -42,7 +42,7 @@ return hidden_states, per_token_scale
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
 make tile_example_layernorm2d_fwd -j
 ```
 This will result in an executable `build/bin/tile_example_layernorm2d_fwd`
diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py
index d77582630a..c4366f6662 100644
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
@@ -235,7 +235,7 @@ float layernorm2d_fwd_(const S& s, A a)
     using Kernel = ck_tile::Layernorm2dFwd<Pipeline, Epilogue>;
 
     const dim3 grids                       = Kernel::GridSize(a);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
+    const dim3 blocks                      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
     auto kargs = Kernel::MakeKargs(a);
@@ -243,7 +243,7 @@ float layernorm2d_fwd_(const S& s, A a)
         std::cout << ", " << Kernel::GetName() << std::flush;
 
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{{}}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(Kernel{{}}, grids, blocks, 0, kargs));
 }}
 
 """
diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt
index b1aede42c7..825cd6e522 100644
--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
 add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp)
 add_executable(tile_example_gemm_weight_preshuffle EXCLUDE_FROM_ALL gemm_weight_preshuffle.cpp)
+add_executable(tile_example_gemm_reduce EXCLUDE_FROM_ALL gemm_splitk_two_stage_reduce.cpp)
 set(EXAMPLE_GEMM_COMPILE_OPTIONS)
 set(EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS)
 if(CK_USE_OCP_FP8)
@@ -14,3 +15,4 @@ list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-rev
 target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 target_compile_options(tile_example_gemm_weight_preshuffle PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS})
+target_compile_options(tile_example_gemm_reduce PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
diff --git a/example/ck_tile/03_gemm/README.md b/example/ck_tile/03_gemm/README.md
index 20cc202176..6358b76fd9 100644
--- a/example/ck_tile/03_gemm/README.md
+++ b/example/ck_tile/03_gemm/README.md
@@ -7,18 +7,19 @@ This folder contains example for GEMM using ck_tile tile-programming implementat
 # in the root of ck_tile
 mkdir build && cd build
 # you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
-sh ../script/cmake-ck-dev.sh  ../ <arch>
+../script/cmake-ck-dev.sh  ../ <arch>
 # The basic pipeline method on the gemm calculation
 make tile_example_gemm_basic -j
 # The memory bound pipeline on the gemm calculation
 make tile_example_gemm_universal -j
+# The weight preshuffle pipeline on the gemm calculation
+make tile_example_gemm_weight_preshuffle -j
 ```
 This will result in an executable `build/bin/tile_example_gemm_basic` & `build/bin/tile_example_gemm_universal`
 
 ## example
 ```
 args:
-          -b    batch size (default:1)
           -m    m dimension (default:1024)
           -n    n dimension (default:2048)
           -k    k dimension (default:64)
@@ -29,9 +30,11 @@ args:
    -stride_b    Tensor B stride (default:0)
    -stride_c    Tensor C stride (default:0)
           -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
-          -e    Absolute error tolerance (default:1e-5)
        -prec    data type. fp16/bf16/fp8/bf8/int8 (default:fp16)
      -warmup    number of iterations before benchmark the kernel (default:10)
      -repeat    number of iterations to benchmark the kernel (default:100)
       -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
+    -split_k    splitK value (default:1)
+       -init    0:random, 1:linear, 2:constant (default:1)
+ -persistent    0:non-persistent, 1:persistent (default:0)
 ```
diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp
index 0d9c2d9957..8cdbe39e86 100644
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -1,15 +1,6 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
-#include <hip/hip_runtime.h>
-
-#include <cstring>
-#include <iostream>
-#include <ostream>
-#include <string>
-#include <tuple>
-
-#include "ck_tile/host.hpp"
 #include "gemm_utils.hpp"
 
 template <typename GemmConfig,
@@ -29,12 +20,6 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
 {
     if constexpr(Persistent)
         std::cout << "WARNING: Ignoring persistent kernel option for basic gemm." << std::endl;
-    // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
-
-    constexpr int kBlockPerCu = 1;
 
     // This part comes from the Codegen
     constexpr ck_tile::index_t M_Tile = 256;
@@ -56,8 +41,12 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
 
     using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
 
-    using CodegenGemmTraits =
-        ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+    using CodegenGemmTraits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                                      GemmConfig::kPadN,
+                                                      GemmConfig::kPadK,
+                                                      ALayout,
+                                                      BLayout,
+                                                      CLayout>;
 
     using CodegenPipelineProblem = ck_tile::
         GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
@@ -76,7 +65,6 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
                                              ck_tile::tuple<>,
                                              CLayout,
                                              ck_tile::element_wise::PassThrough,
-                                             CodegenPipelineProblem::kBlockSize,
                                              TilePartitioner::MPerBlock,
                                              TilePartitioner::NPerBlock,
                                              M_Warp,
@@ -92,8 +80,8 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
         using Kernel = ck_tile::GemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
         auto kargs   = Kernel::MakeKernelArgs(args);
 
-        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
-        constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+        const dim3 blocks = Kernel::BlockSize();
 
         if(!Kernel::IsSupportedArgument(kargs))
         {
@@ -112,27 +100,27 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
         }
 
         float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+            s, ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
         return ave_time;
     };
 
     if(args.k_batch == 1)
     {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              ck_tile::memory_operation_enum::set>{});
+        return Run(MemoryOpSet{});
     }
     else
     {
-        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                              ck_tile::memory_operation_enum::atomic_add>{});
+        return Run(MemoryOpAtomicAdd{});
     }
 }
 
 #include "run_gemm_example.inc"
 
 template <typename APrecType, typename BPrecType = APrecType, typename CPrecType = APrecType>
-int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+int run_gemm_example_prec_type(std::string a_layout,
+                               std::string b_layout,
+                               ck_tile::ArgParser& arg_parser)
 {
     using Row = ck_tile::tensor_layout::gemm::RowMajor;
     using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
@@ -142,12 +130,12 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
         if(a_layout == "R" && b_layout == "C")
         {
             return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
-                argc, argv, Row{}, Col{}, Row{});
+                arg_parser, Row{}, Col{}, Row{});
         }
         else if(a_layout == "C" && b_layout == "C")
         {
             return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
-                argc, argv, Col{}, Col{}, Row{});
+                arg_parser, Col{}, Col{}, Row{});
         }
         else
         {
@@ -160,22 +148,22 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
         if(a_layout == "R" && b_layout == "C")
         {
             return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
-                argc, argv, Row{}, Col{}, Row{});
+                arg_parser, Row{}, Col{}, Row{});
         }
         else if(a_layout == "R" && b_layout == "R")
         {
             return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
-                argc, argv, Row{}, Row{}, Row{});
+                arg_parser, Row{}, Row{}, Row{});
         }
         else if(a_layout == "C" && b_layout == "R")
         {
             return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
-                argc, argv, Col{}, Row{}, Row{});
+                arg_parser, Col{}, Row{}, Row{});
         }
         else if(a_layout == "C" && b_layout == "C")
         {
             return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
-                argc, argv, Col{}, Col{}, Row{});
+                arg_parser, Col{}, Col{}, Row{});
         }
         else
         {
@@ -184,38 +172,34 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
     }
 }
 
-int run_gemm_example(int argc, char* argv[])
+int run_gemm_example(ck_tile::ArgParser& arg_parser)
 {
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
     std::string data_type = arg_parser.get_str("prec");
     std::string a_layout  = arg_parser.get_str("a_layout");
     std::string b_layout  = arg_parser.get_str("b_layout");
 
     if(data_type == "fp16")
     {
-        return run_gemm_example_prec_type<ck_tile::half_t>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<ck_tile::half_t>(a_layout, b_layout, arg_parser);
     }
     else if(data_type == "bf16")
     {
-        return run_gemm_example_prec_type<ck_tile::bf16_t>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<ck_tile::bf16_t>(a_layout, b_layout, arg_parser);
     }
     else if(data_type == "fp8")
     {
         return run_gemm_example_prec_type<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>(
-            a_layout, b_layout, argc, argv);
+            a_layout, b_layout, arg_parser);
     }
     else if(data_type == "bf8")
     {
         return run_gemm_example_prec_type<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>(
-            a_layout, b_layout, argc, argv);
+            a_layout, b_layout, arg_parser);
     }
     else if(data_type == "i8")
     {
         return run_gemm_example_prec_type<ck_tile::int8_t, ck_tile::int8_t, int32_t>(
-            a_layout, b_layout, argc, argv);
+            a_layout, b_layout, arg_parser);
     }
     else if(data_type == "pk_int4_t")
     {
@@ -223,7 +207,7 @@ int run_gemm_example(int argc, char* argv[])
         if constexpr(GemmConfigBase::Pipeline == CK_TILE_PIPELINE_COMPUTE_V3)
         {
             return run_gemm_example_prec_type<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>(
-                a_layout, b_layout, argc, argv);
+                a_layout, b_layout, arg_parser);
         }
         else
         {
@@ -238,9 +222,13 @@ int run_gemm_example(int argc, char* argv[])
 
 int main(int argc, char* argv[])
 {
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
     try
     {
-        return !run_gemm_example(argc, argv);
+        return !run_gemm_example(arg_parser);
     }
     catch(const std::runtime_error& e)
     {
diff --git a/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp b/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp
new file mode 100644
index 0000000000..f42135a0b5
--- /dev/null
+++ b/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp
@@ -0,0 +1,1006 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/reduce.hpp"
+#include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
+#include "gemm_utils.hpp"
+#include "run_gemm_example.inc"
+
+/**
+ * @brief Tile partitioner with output offset support.
+ *
+ * Extends ck_tile::GemmSpatiallyLocalTilePartitioner with GetOutputOffset(), which gives the
+ * offset at which a K-split's partial result starts inside the split-K workspace:
+ * K-split k_id starts at offset k_id * M * N.
+ */
+template <typename BlockGemmShapeType, ck_tile::index_t GroupNum, ck_tile::index_t M01>
+struct GemmSplitKTilePartitioner
+    : public ck_tile::GemmSpatiallyLocalTilePartitioner<BlockGemmShapeType, GroupNum, M01>
+{
+    using Base = ck_tile::GemmSpatiallyLocalTilePartitioner<BlockGemmShapeType, GroupNum, M01>;
+
+    // Re-export the base constructors and GetLoopNum through this derived type
+    using Base::Base;
+    using Base::GetLoopNum;
+
+    /**
+     * @brief Calculate output pointer offset for split-K reduction.
+     *
+     * @param kargs  Kernel arguments; must expose k_batch, M and N members.
+     * @param k_id   Current K-split ID (presumably blockIdx.z or a derived index; see caller).
+     * @return ck_tile::index_t  k_id * M * N when k_batch > 1, otherwise 0.
+     */
+    template <typename KernelArgs>
+    CK_TILE_HOST_DEVICE static ck_tile::index_t GetOutputOffset(const KernelArgs& kargs,
+                                                                ck_tile::index_t k_id) noexcept
+    {
+        // Each K-split gets its own M*N workspace slice; a single split writes at offset 0
+        return (kargs.k_batch > 1) ? (k_id * kargs.M * kargs.N) : 0;
+    }
+};
+
+/**
+ * @brief Extended GEMM host arguments for two-stage split-K implementation
+ *
+ * This structure supports the two-stage split-K approach where:
+ * 1. Stage 1: GEMM writes partial results to workspace memory
+ * 2. Stage 2: Reduction kernel sums workspace results to final output
+ * The base class e_ptr points to workspace, while final_output_ptr points to the actual output.
+ * A default-constructed object has final_output_ptr == nullptr and final_stride_E == 0.
+ */
+struct GemmSplitKHostArgs : public ck_tile::GemmHostArgs
+{
+    using BaseArgs = ck_tile::GemmHostArgs;
+
+    CK_TILE_HOST GemmSplitKHostArgs() = default;
+    CK_TILE_HOST GemmSplitKHostArgs(const void* a_ptr_,
+                                    const void* b_ptr_,
+                                    void* workspace_ptr_, // Workspace for partial results
+                                    void* e_ptr_,         // Final output destination
+                                    ck_tile::index_t k_batch_,
+                                    ck_tile::index_t M_,
+                                    ck_tile::index_t N_,
+                                    ck_tile::index_t K_,
+                                    ck_tile::index_t stride_A_,
+                                    ck_tile::index_t stride_B_,
+                                    ck_tile::index_t workspace_stride_,
+                                    ck_tile::index_t stride_E_)
+        : BaseArgs(a_ptr_,
+                   b_ptr_,
+                   workspace_ptr_, // Base e_ptr = workspace_ptr
+                   k_batch_,
+                   M_,
+                   N_,
+                   K_,
+                   stride_A_,
+                   stride_B_,
+                   workspace_stride_), // Base stride_E = workspace stride
+          final_output_ptr(e_ptr_),
+          final_stride_E(stride_E_)
+    {
+    }
+
+    void* final_output_ptr          = nullptr; // Final output tensor (nullptr until provided)
+    ck_tile::index_t final_stride_E = 0;       // Stride for final output tensor
+};
+
+/**
+ * @brief Stage 1: GEMM kernel that writes partial split-K results to workspace
+ *
+ * Builds a ck_tile::GemmKernel from GemmConfig and the data/layout types, launches it on
+ * args.e_ptr (the workspace) with memory_operation_enum::set, and times the launch.
+ *
+ * Workspace layout: [k_batch, M, N] where each [M, N] slice contains
+ * partial results for one K-split.
+ * @param args Extended arguments; only the base GemmHostArgs fields are read here
+ * @param s Stream configuration for kernel execution (also selects the cache-flush path)
+ * @return Time reported by ck_tile::launch_kernel / launch_kernel_time_mask (milliseconds)
+ * @throws std::runtime_error if Kernel::IsSupportedArgument rejects the arguments
+ */
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          bool Persistent,
+          typename CDEElementWise>
+float gemm_stage1(const GemmSplitKHostArgs& args, const ck_tile::stream_config& s)
+{
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+        GemmConfig::PermuteA,
+        GemmConfig::PermuteB>;
+
+    using TilePartitioner = GemmSplitKTilePartitioner<GemmShape,
+                                                      GemmConfig::TileParitionerGroupNum,
+                                                      GemmConfig::TileParitionerM01>;
+
+    using Traits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                           GemmConfig::kPadN,
+                                           GemmConfig::kPadK,
+                                           ALayout,
+                                           BLayout,
+                                           ELayout,
+                                           GemmConfig::NumWaveGroups>;
+
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                                                 GemmConfig::kPadN,
+                                                                 GemmConfig::kPadK,
+                                                                 GemmConfig::DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 ELayout,
+                                                                 GemmConfig::TransposeC,
+                                                                 GemmConfig::UseStructuredSparsity,
+                                                                 Persistent,
+                                                                 GemmConfig::NumWaveGroups,
+                                                                 GemmConfig::Preshuffle>;
+
+    using GemmPipelineProblem =
+        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+    using BaseGemmPipeline = typename PipelineTypeTraits<
+        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
+
+    const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
+    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
+    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+    float ave_time{0};
+
+    // Slice args down to plain GemmHostArgs for Kernel::MakeKernelArgs. e_ptr/stride_E here are
+    // the workspace pointer/stride, so the kernel writes partial results, not the final output.
+    ck_tile::GemmHostArgs base_args(args.a_ptr,
+                                    args.b_ptr,
+                                    args.e_ptr,
+                                    args.k_batch,
+                                    args.M,
+                                    args.N,
+                                    args.K,
+                                    args.stride_A,
+                                    args.stride_B,
+                                    args.stride_E);
+
+    const auto Run = [&](const auto has_hot_loop_,
+                         const auto tail_number_,
+                         const auto memory_operation_) {
+        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+        constexpr auto tail_number_v    = tail_number_.value;
+        constexpr auto scheduler        = GemmConfig::Scheduler;
+        constexpr auto memory_operation = memory_operation_.value;
+
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler,
+                                                                           has_hot_loop_v,
+                                                                           tail_number_v>;
+
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             memory_operation,
+                                             GemmConfig::NumWaveGroups>>;
+
+        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(base_args);
+
+        dim3 grids;
+        if constexpr(Persistent)
+        {
+            grids = Kernel::MaxOccupancyGridSize(s);
+        }
+        else
+        {
+            grids = Kernel::GridSize(args.M, args.N, args.k_batch);
+        }
+        const dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Stage 1 - Launching GEMM kernel: " << Kernel::GetName() << '\n'
+                      << "shape: " << GemmShape::GetName() << '\n'
+                      << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                      << "pipeline: " << GemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+
+        if(s.flush_cache_)
+        {
+            std::cout << "Flushing cache..." << std::endl;
+
+            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+            auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+            auto size_b_buffer = b_n.get_element_space_size_in_bytes();
+
+            ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
+                kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
+            rotating_mem.Print();
+
+            auto run_flush_cache = [&]() {
+                // flush icache
+                ck_tile::flush_icache();
+                // rotating mem
+                rotating_mem.Next();
+                // clear c mem. NOTE(review): zeroes only M*N elements of the workspace (one slice) and drops the error string
+                if(args.k_batch > 1)
+                    hipGetErrorString(hipMemsetAsync(
+                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+            };
+            ave_time = ck_tile::launch_kernel_time_mask(
+                s,
+                run_flush_cache,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        }
+        else
+        {
+            ave_time = ck_tile::launch_kernel(
+                s,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        }
+        return ave_time;
+    };
+
+    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+        // Workspace mode always uses SET (each K-split has its own slice); Run's result is read back via ave_time
+        Run(has_hot_loop_,
+            tail_number_,
+            ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                       ck_tile::memory_operation_enum::set>{});
+    };
+
+    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+    return ave_time;
+}
+
+/**
+ * @brief Stage 2: Reduction kernel that sums partial split-K results to final output
+ *
+ * Launches ck_tile::Reduce with ReduceOp::Add over dimension 0 of the workspace
+ * (args.e_ptr) and writes the result to args.final_output_ptr.
+ *
+ * Workspace layout: [k_batch, M, N] -> Final output: [M, N]
+ * @tparam CDataType Element type of both the workspace and the final output
+ * @tparam ComputeDataType Computation precision for reduction
+ * @tparam ELayout Output layout. NOTE(review): not referenced in the body; N stride is fixed to 1
+ * @param args Extended arguments containing workspace and output information
+ * @param s Stream configuration for kernel execution
+ * @return Time reported by ck_tile::launch_kernel (milliseconds)
+ * @throws std::runtime_error if Kernel::IsSupportedArgument rejects the reduction arguments
+ */
+template <typename CDataType,
+          typename ComputeDataType = float,
+          typename ELayout         = ck_tile::tensor_layout::gemm::RowMajor>
+float reduce_stage2(const GemmSplitKHostArgs& args, const ck_tile::stream_config& s)
+{
+    const ck_tile::index_t reduce_dim_size = args.k_batch; // Number of partial results to reduce
+    // Number of output elements (M * N); used below only to size the launch grid
+    const ck_tile::index_t output_size = args.M * args.N;
+
+    // Workspace is described as [k_batch, M, N]: slices are M*N apart, rows are final_stride_E
+    // apart, columns are contiguous. NOTE(review): slices overlap if final_stride_E > N - confirm.
+    auto workspace_shape = ck_tile::make_tuple(args.k_batch, args.M, args.N);
+    auto workspace_strides =
+        ck_tile::make_tuple(args.M * args.N,     // k_batch stride: jump to next K split
+                            args.final_stride_E, // stride same as final output stride
+                            1);
+
+    // Define kept and reduced dimensions
+    constexpr auto kept_dim    = ck_tile::sequence<1, 2>{}; // Keep M, N dimensions
+    constexpr auto reduce_dims = ck_tile::sequence<0>{};    // Reduce k_batch dimension
+
+    using ReduceOp   = ck_tile::ReduceOp::Add;
+    using BlockWarps = ck_tile::sequence<4, 1>;
+    using BlockTile  = ck_tile::sequence<128, 128>;
+    using WarpTile   = ck_tile::sequence<32, 128>;
+    using ThreadTile = ck_tile::sequence<8, 8>;
+
+    constexpr ck_tile::index_t kBlockSize  = 256;
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    ck_tile::index_t kGridSize = (output_size + BlockTile::at(ck_tile::number<0>{}) - 1) /
+                                 BlockTile::at(ck_tile::number<0>{});
+
+    using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
+    using Problem =
+        ck_tile::Reduce2dProblem<CDataType, ComputeDataType, CDataType, Shape, ReduceOp>;
+    using Kernel = ck_tile::Reduce<Problem>;
+
+    if(!Kernel::IsSupportedArgument(reduce_dim_size, workspace_strides))
+    {
+        throw std::runtime_error("Wrong! Reduction arguments not supported!\n");
+    }
+
+    if(s.log_level_ > 0)
+    {
+        std::cout << "Stage 2 - Launching Reduction kernel" << '\n'
+                  << "workspace shape: [" << args.k_batch << ", " << args.M << ", " << args.N << "]"
+                  << '\n'
+                  << "output shape: [" << args.M << ", " << args.N << "]" << '\n'
+                  << "grid size: " << kGridSize << std::endl;
+    }
+
+    float ave_time =
+        ck_tile::launch_kernel(s,
+                               ck_tile::make_kernel<kBlockPerCu>(
+                                   Kernel{},
+                                   kGridSize,
+                                   kBlockSize,
+                                   0,                                         // LDS size
+                                   static_cast<const CDataType*>(args.e_ptr), // workspace input
+                                   static_cast<CDataType*>(args.final_output_ptr), // final output
+                                   workspace_shape,
+                                   workspace_strides,
+                                   kept_dim,
+                                   reduce_dims));
+
+    return ave_time;
+}
+
+/**
+ * @brief Orchestrator for two-stage split-K GEMM implementation
+ *
+ * This function coordinates the two-stage approach:
+ * 1. Stage 1: Execute GEMM with each K-split writing to workspace
+ * 2. Stage 2: Reduce workspace results to final output (k_batch > 1), or copy the single
+ *    workspace slice to the final output with hipMemcpyAsync (k_batch == 1, not timed)
+ * @param args Extended arguments for two-stage execution
+ * @param s Stream configuration
+ * @return Total execution time (GEMM + Reduction); throws std::runtime_error on HIP failure
+ */
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          bool Persistent,
+          typename CDEElementWise>
+float gemm_splitk_two_stage(const GemmSplitKHostArgs& args, const ck_tile::stream_config& s)
+{
+    float gemm_time   = 0.0f;
+    float reduce_time = 0.0f;
+
+    if(s.log_level_ > 0)
+    {
+        std::cout << "Starting Two-Stage GEMM+SplitK with k_batch=" << args.k_batch << std::endl;
+        std::cout << "Workspace size: " << args.k_batch << " x " << args.M << " x " << args.N
+                  << " = " << sizeof(CDataType) * args.k_batch * args.M * args.N << " bytes"
+                  << std::endl;
+    }
+
+    // Stage 1: GEMM to workspace
+    gemm_time = gemm_stage1<GemmConfig,
+                            ADataType,
+                            BDataType,
+                            DsDataType,
+                            AccDataType,
+                            CDataType,
+                            ALayout,
+                            BLayout,
+                            DsLayout,
+                            ELayout,
+                            Persistent,
+                            CDEElementWise>(args, s);
+
+    // Synchronize before stage 2
+    auto sync_result = hipStreamSynchronize(s.stream_id_);
+    if(sync_result != hipSuccess)
+    {
+        throw std::runtime_error("Stream synchronization failed");
+    }
+
+    // Stage 2: Reduction from workspace to final output (if needed)
+    if(args.k_batch > 1)
+    {
+        // Reduce half_t/bf16_t outputs in float; every other output type reduces in itself
+        using ComputeDataType = std::conditional_t<
+            std::is_same_v<CDataType, ck_tile::half_t>,
+            float,
+            std::conditional_t<std::is_same_v<CDataType, ck_tile::bf16_t>, float, CDataType>>;
+        reduce_time = reduce_stage2<CDataType, ComputeDataType, ELayout>(args, s);
+    }
+    else
+    {
+        // Single K-split: plain copy; sizeof() leads so the byte count is multiplied in size_t
+        auto copy_result = hipMemcpyAsync(args.final_output_ptr,
+                                          args.e_ptr,
+                                          sizeof(CDataType) * args.M * args.N,
+                                          hipMemcpyDeviceToDevice,
+                                          s.stream_id_);
+        if(copy_result != hipSuccess)
+        {
+            throw std::runtime_error("Memory copy failed");
+        }
+    }
+
+    if(s.log_level_ > 0)
+    {
+        std::cout << "GEMM stage time: " << gemm_time << " ms" << std::endl;
+        if(args.k_batch > 1)
+        {
+            std::cout << "Reduction stage time: " << reduce_time << " ms" << std::endl;
+        }
+        std::cout << "Total time: " << gemm_time + reduce_time << " ms" << std::endl;
+    }
+
+    return gemm_time + reduce_time;
+}
+
+/**
+ * @brief High-level interface for two-stage split-K GEMM execution
+ * Allocates a zeroed kbatch * M * N workspace, runs both stages and prints a timing summary.
+ * @param a_m_k_dev_buf Input matrix A device buffer
+ * @param b_k_n_dev_buf Input matrix B device buffer
+ * @param c_m_n_dev_buf Output matrix C device buffer
+ * @param M Matrix M dimension
+ * @param N Matrix N dimension
+ * @param K Matrix K dimension
+ * @param stride_A Memory stride for matrix A
+ * @param stride_B Memory stride for matrix B
+ * @param stride_C Memory stride for matrix C (also used as the workspace stride)
+ * @param kbatch Number of K-splits for split-K execution
+ * @param n_warmup Number of warmup iterations
+ * @param n_repeat Number of repeat iterations for benchmarking
+ * @param persistent Whether to use persistent kernel execution
+ * @return Average execution time in milliseconds
+ */
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float invoke_gemm_splitk_two_stage(ck_tile::DeviceMem& a_m_k_dev_buf,
+                                   ck_tile::DeviceMem& b_k_n_dev_buf,
+                                   ck_tile::DeviceMem& c_m_n_dev_buf,
+                                   ck_tile::index_t M,
+                                   ck_tile::index_t N,
+                                   ck_tile::index_t K,
+                                   ck_tile::index_t stride_A,
+                                   ck_tile::index_t stride_B,
+                                   ck_tile::index_t stride_C,
+                                   ck_tile::index_t kbatch,
+                                   int n_warmup,
+                                   int n_repeat,
+                                   bool persistent)
+{
+    // Workspace size in bytes for kbatch * M * N elements, multiplied and kept in size_t
+    const std::size_t workspace_size        = sizeof(CDataType) * kbatch * M * N;
+    const ck_tile::index_t workspace_stride = stride_C; // Stride of each workspace slice (= C's)
+
+    // Allocate workspace memory
+    ck_tile::DeviceMem workspace_buf(workspace_size);
+    workspace_buf.SetZero();
+
+    // Create extended args for two-stage approach
+    GemmSplitKHostArgs args{
+        a_m_k_dev_buf.GetDeviceBuffer(), // a_ptr
+        b_k_n_dev_buf.GetDeviceBuffer(), // b_ptr
+        workspace_buf.GetDeviceBuffer(), // workspace_ptr (used as e_ptr for stage 1)
+        c_m_n_dev_buf.GetDeviceBuffer(), // final_output_ptr
+        kbatch,                          // k_batch
+        M,
+        N,
+        K, // dimensions
+        stride_A,
+        stride_B,         // input strides
+        workspace_stride, // workspace stride
+        stride_C          // final output stride
+    };
+
+    float ave_time;
+    ck_tile::stream_config config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50};
+
+    if(persistent)
+    {
+        ave_time = gemm_splitk_two_stage<GemmConfig,
+                                         ADataType,
+                                         BDataType,
+                                         DsDataType,
+                                         AccDataType,
+                                         CDataType,
+                                         ALayout,
+                                         BLayout,
+                                         DsLayout,
+                                         CLayout,
+                                         true,
+                                         CDEElementWise>(args, config);
+    }
+    else
+    {
+        ave_time = gemm_splitk_two_stage<GemmConfig,
+                                         ADataType,
+                                         BDataType,
+                                         DsDataType,
+                                         AccDataType,
+                                         CDataType,
+                                         ALayout,
+                                         BLayout,
+                                         DsLayout,
+                                         CLayout,
+                                         false,
+                                         CDEElementWise>(args, config);
+    }
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_byte =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run Two-Stage GEMM+SplitK with M=" << M << " N=" << N << " K=" << K
+              << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C
+              << " kbatch=" << kbatch << " WorkspaceSize=" << workspace_size << " bytes"
+              << " A_Layout=" << ALayout::name << " B_Layout =" << BLayout::name
+              << " C_Layout=" << CLayout::name << " A_Type=" << DataTypeTraits<ADataType>::name
+              << " B_Type=" << DataTypeTraits<BDataType>::name
+              << " C_Type=" << DataTypeTraits<CDataType>::name
+              << " StructuredSparsity=" << (GemmConfig::UseStructuredSparsity ? "on" : "off")
+              << " Persistent=" << (persistent ? "on" : "off") << " : " << ave_time << " ms, "
+              << tflops << " TFlops, " << gb_per_sec << " GB/s" << std::endl;
+
+    return ave_time;
+}
+
+// Two-stage implementation of run_gemm_example_with_layouts
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType = ADataType,
+          typename CDataType = ADataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+int run_gemm_example_with_layouts_two_stage(int argc,
+                                            char* argv[],
+                                            const ALayout a_layout                  = ALayout{},
+                                            const BLayout b_layout                  = BLayout{},
+                                            [[maybe_unused]] const CLayout c_layout = CLayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    using AccDataType = typename GemmTypeConfig<ADataType, BDataType, CDataType>::AccDataType;
+
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+
+    ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
+    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
+    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
+
+    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
+    bool persistent              = arg_parser.get_int("persistent");
+
+    const bool preshuffle = GemmConfig::Preshuffle;
+
+    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
+    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
+    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
+
+    ck_tile::HostTensor<ADataType> a_m_k(
+        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
+    ck_tile::HostTensor<BDataType> b_k_n(
+        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
+    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+
+    if(init_method == 0)
+    {
+        if constexpr(preshuffle)
+        {
+            ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_m_k);
+            ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_k_n);
+        }
+        else
+        {
+            ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
+            ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+        }
+    }
+    else if(init_method == 1)
+    {
+        ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
+        ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
+    }
+    else if(init_method == 2)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_k_n);
+    }
+    else
+    {
+        a_m_k.SetZero();
+        b_k_n.SetZero();
+    }
+
+    if(!preshuffle && GemmConfig::UseStructuredSparsity)
+    {
+        ck_tile::AdjustToStructuredSparsity<ADataType>{}(a_m_k);
+    }
+
+    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+    static_assert(!GemmConfig::PermuteA, "Not implemented");
+
+    if constexpr(preshuffle)
+    {
+        ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b<GemmConfig>(b_k_n);
+        // shuffled buffer B for device implementation
+        b_k_n_dev_buf.ToDevice(b_shuffle_host.data());
+    }
+    else
+    {
+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        {
+            // Permute vector pk_i4x4 data for device implementation
+            ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
+            if constexpr(GemmConfig::PermuteB)
+            {
+                permute_tensor_b<GemmConfig,
+                                 decltype(b_k_n_dev),
+                                 ADataType,
+                                 BDataType,
+                                 AccDataType,
+                                 CDataType,
+                                 ALayout,
+                                 BLayout,
+                                 CLayout>(b_k_n_dev);
+            }
+            permute_vectors_i4x4_b(b_k_n_dev);
+            b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
+        }
+        else
+        {
+            if constexpr(GemmConfig::PermuteB)
+            {
+                std::cout << "Permute for this DataType is not implemented." << std::endl;
+                return false;
+            }
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
+    }
+
+    a_m_k_dev_buf.ToDevice(a_m_k.data());
+    c_m_n_dev_buf.SetZero();
+    c_m_n_dev_result.SetZero();
+
+    std::cout << "Using Workspace Split-K Mode (Two-Stage with Reduction)" << std::endl;
+    // Use the new two-stage approach
+    invoke_gemm_splitk_two_stage<GemmConfig,
+                                 ADataType,
+                                 BDataType,
+                                 ck_tile::tuple<>,
+                                 AccDataType,
+                                 CDataType,
+                                 ALayout,
+                                 BLayout,
+                                 ck_tile::tuple<>,
+                                 CLayout>(a_m_k_dev_buf,
+                                          b_k_n_dev_buf,
+                                          c_m_n_dev_buf,
+                                          M,
+                                          N,
+                                          K,
+                                          stride_A,
+                                          stride_B,
+                                          stride_C,
+                                          kbatch,
+                                          n_warmup,
+                                          n_repeat,
+                                          persistent);
+
+    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+    bool pass = true;
+
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+        c_m_n_host_ref.SetZero();
+
+        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_m_k, b_k_n, c_m_n_host_ref);
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+            K, kbatch, max_accumulated_value);
+        pass = ck_tile::check_err(c_m_n_dev_result,
+                                  c_m_n_host_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        {
+            // Restore input for B for gpu reference
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
+        if constexpr(GemmConfig::Preshuffle)
+        {
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
+
+        // memory on host to store gpu reference result
+        ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+        // memory on device to store gpu reference result
+        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
+
+        c_m_n_gpu_ref.SetZero();
+        c_m_n_gpu_buf_ref.SetZero();
+
+        ADataType* d_A = static_cast<ADataType*>(a_m_k_dev_buf.GetDeviceBuffer());
+        BDataType* d_B = static_cast<BDataType*>(b_k_n_dev_buf.GetDeviceBuffer());
+        CDataType* d_C = static_cast<CDataType*>(c_m_n_gpu_buf_ref.GetDeviceBuffer());
+
+        ck_tile::reference_gemm_gpu<ADataType,
+                                    BDataType,
+                                    AccDataType,
+                                    CDataType,
+                                    ALayout,
+                                    BLayout,
+                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
+
+        c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
+
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_gpu_ref.mData.begin(), c_m_n_gpu_ref.mData.end());
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+            K, kbatch, max_accumulated_value);
+        pass = ck_tile::check_err(c_m_n_dev_result,
+                                  c_m_n_gpu_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The GPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
+    }
+
+    return pass;
+}
+
+// Selects the layout tags for the two-stage split-K GEMM example from the runtime layout
+// strings ("R" = row major, "C" = column major; C is always row major) and forwards to
+// run_gemm_example_with_layouts_two_stage. Throws std::runtime_error on unsupported combinations.
+template <typename GemmConfig,
+          typename APrecType,
+          typename BPrecType = APrecType,
+          typename CPrecType = APrecType>
+int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+{
+    using Row                 = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col                 = ck_tile::tensor_layout::gemm::ColumnMajor;
+    auto [result, arg_parser] = create_args(argc, argv);
+    bool preshuffle           = GemmConfig::Preshuffle;
+
+    if(preshuffle && std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
+    {
+        throw std::runtime_error("Preshuffle is not supported for this int4 datatype!");
+    }
+
+    // Reject unless A is row major AND B is column major (either mismatch is unsupported).
+    if(preshuffle && (a_layout != "R" || b_layout != "C"))
+    {
+        throw std::runtime_error(
+            "Preshuffle is supported only for A(Row major), B(column major) input matrices!");
+    }
+
+    // pk_int4_t B is dispatched for column-major B only; other types take all four A/B layouts.
+    if constexpr(std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
+    {
+        if(a_layout == "R" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts_two_stage<GemmConfig,
+                                                           APrecType,
+                                                           BPrecType,
+                                                           CPrecType>(
+                argc, argv, Row{}, Col{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts_two_stage<GemmConfig,
+                                                           APrecType,
+                                                           BPrecType,
+                                                           CPrecType>(
+                argc, argv, Col{}, Col{}, Row{});
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported memory layout for the input matrices when "
+                                     "BPrecType is ck_tile::pk_int4_t!");
+        }
+    }
+    else
+    {
+        if(a_layout == "R" && b_layout == "R")
+        {
+            return run_gemm_example_with_layouts_two_stage<GemmConfig,
+                                                           APrecType,
+                                                           BPrecType,
+                                                           CPrecType>(
+                argc, argv, Row{}, Row{}, Row{});
+        }
+        else if(a_layout == "R" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts_two_stage<GemmConfig,
+                                                           APrecType,
+                                                           BPrecType,
+                                                           CPrecType>(
+                argc, argv, Row{}, Col{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "R")
+        {
+            return run_gemm_example_with_layouts_two_stage<GemmConfig,
+                                                           APrecType,
+                                                           BPrecType,
+                                                           CPrecType>(
+                argc, argv, Col{}, Row{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts_two_stage<GemmConfig,
+                                                           APrecType,
+                                                           BPrecType,
+                                                           CPrecType>(
+                argc, argv, Col{}, Col{}, Row{});
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported memory layout for the input matrices!");
+        }
+    }
+    return 0;
+}
+
+// Parses the command line and dispatches the GEMM example on the "prec" option, forwarding
+// the "a_layout"/"b_layout" options. Note that "bf16" reuses GemmConfig<ck_tile::half_t>.
+// Returns -1 when argument parsing fails, otherwise the result of run_gemm_example_prec_type.
+// Throws std::runtime_error for an unsupported precision or (for pk_int4_t) pipeline.
+template <template <typename PreType> typename GemmConfig>
+int run_gemm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string prec     = arg_parser.get_str("prec");
+    const std::string layout_a = arg_parser.get_str("a_layout");
+    const std::string layout_b = arg_parser.get_str("b_layout");
+
+    if(prec == "fp16")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
+            layout_a, layout_b, argc, argv);
+    }
+    if(prec == "bf16")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::bf16_t>(
+            layout_a, layout_b, argc, argv);
+    }
+    if(prec == "fp8")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                          ck_tile::fp8_t,
+                                          ck_tile::fp8_t,
+                                          ck_tile::half_t>(layout_a, layout_b, argc, argv);
+    }
+    if(prec == "bf8")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                          ck_tile::bf8_t,
+                                          ck_tile::bf8_t,
+                                          ck_tile::half_t>(layout_a, layout_b, argc, argv);
+    }
+    if(prec == "int8")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::int8_t>,
+                                          ck_tile::int8_t,
+                                          ck_tile::int8_t,
+                                          ck_tile::int32_t>(layout_a, layout_b, argc, argv);
+    }
+    if(prec == "pk_int4_t")
+    {
+        // TODO: Add support for bhalf_t ADataType
+        if constexpr(GemmConfig<ck_tile::half_t>::Pipeline == CK_TILE_PIPELINE_COMPUTE_V3)
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>,
+                                              ck_tile::half_t,
+                                              ck_tile::pk_int4_t,
+                                              ck_tile::half_t>(layout_a, layout_b, argc, argv);
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported pipeline for this operation !!!");
+        }
+    }
+    throw std::runtime_error("Unsupported data type for this operation !!!");
+}
+
+// Exit status is EXIT_SUCCESS only when the example reports a positive (passing) result.
+int main(int argc, char* argv[])
+{
+    try
+    {
+        // The example yields 1 on pass, 0 on failure and -1 when argument parsing fails;
+        // a plain logical negation would turn -1 into exit code 0, so compare explicitly.
+        return run_gemm_example<GemmConfigComputeV3>(argc, argv) > 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+    }
+    catch(const std::runtime_error& e)
+    {
+        std::cerr << "Caught runtime error: " << e.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
diff --git a/example/ck_tile/03_gemm/gemm_utils.hpp b/example/ck_tile/03_gemm/gemm_utils.hpp
old mode 100644
new mode 100755
index cab110597b..eb0a6de8aa
--- a/example/ck_tile/03_gemm/gemm_utils.hpp
+++ b/example/ck_tile/03_gemm/gemm_utils.hpp
@@ -34,6 +34,7 @@ constexpr ck_tile::index_t get_k_warp_tile()
         return 32;
 #endif
 }
+
 template <typename PrecType, ck_tile::index_t M_Warp_Tile>
 constexpr ck_tile::index_t get_k_warp_tile_flatmm()
 {
@@ -171,6 +172,27 @@ struct GemmConfigComputeV3_2 : public GemmConfigBase
     static constexpr int kBlockPerCu = 2;
 };
 
+template <typename PrecType> // PrecType: element type; only its size is used (see K_Tile)
+struct GemmConfigComputeV3_WMMA : public GemmConfigBase // COMPUTE_V3 config, 16x16x16 warp tiles
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType); // 64 bytes' worth of elements
+
+    static constexpr ck_tile::index_t M_Warp = 4;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+
+    static constexpr int kBlockPerCu = 2;
+};
+
 template <typename PrecType>
 struct GemmConfigComputeV4 : public GemmConfigBase
 {
@@ -232,11 +254,11 @@ struct GemmConfigComputeV5 : public GemmConfigBase
 };
 
 template <typename PrecType>
-struct GemmConfigPreshuffle_1 : public GemmConfigBase
+struct GemmConfigPreshuffleDecode : public GemmConfigBase
 {
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+    static constexpr ck_tile::index_t M_Tile = 16;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);
 
     static constexpr ck_tile::index_t M_Warp = 1;
     static constexpr ck_tile::index_t N_Warp = 4;
@@ -246,15 +268,15 @@ struct GemmConfigPreshuffle_1 : public GemmConfigBase
     static constexpr ck_tile::index_t N_Warp_Tile = 16;
     static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile_flatmm<PrecType, M_Warp_Tile>();
 
-    static constexpr int kBlockPerCu           = 2;
+    static constexpr int kBlockPerCu           = 1;
     static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
-    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE_V1;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE_V2;
     static constexpr bool Preshuffle           = true;
-    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr bool DoubleSmemBuffer     = true;
 };
 
 template <typename PrecType>
-struct GemmConfigPreshuffle_2 : public GemmConfigBase
+struct GemmConfigPreshufflePrefill : public GemmConfigBase
 {
     static constexpr ck_tile::index_t M_Tile = 128;
     static constexpr ck_tile::index_t N_Tile = 128;
@@ -470,12 +492,18 @@ auto create_args(int argc, char* argv[])
         .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
         .insert("persistent", "0", "0:non-persistent, 1:persistent")
         .insert("flush_cache", "true", "flush cache before running the kernel, defaults to true")
-        .insert("rotating_count", "1", "rotating count, defaults to 1");
+        .insert("rotating_count", "1000", "rotating count, defaults to 1000");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
 }
 
+// Integral-constant type tags for ck_tile::memory_operation_enum::set and ::atomic_add.
+using MemoryOpSet =
+    std::integral_constant<ck_tile::memory_operation_enum, ck_tile::memory_operation_enum::set>;
+using MemoryOpAtomicAdd = std::integral_constant<ck_tile::memory_operation_enum,
+                                                 ck_tile::memory_operation_enum::atomic_add>;
+
 // host API
 template <typename ADataType,
           typename BDataType,
diff --git a/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp b/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp
index 0a06787e2b..0018db2c99 100644
--- a/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp
+++ b/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp
@@ -103,7 +103,6 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
                                              DsLayout,
                                              ELayout,
                                              CDEElementWise,
-                                             UniversalGemmProblem::kBlockSize,
                                              TilePartitioner::MPerBlock,
                                              TilePartitioner::NPerBlock,
                                              GemmConfig::M_Warp,
@@ -126,7 +125,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
         {
             grids = Kernel::GridSize(args.M, args.N, args.k_batch);
         }
-        constexpr dim3 blocks = Kernel::BlockSize();
+        dim3 blocks = Kernel::BlockSize();
 
         if(!Kernel::IsSupportedArgument(kargs))
         {
@@ -141,7 +140,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
                       << "pipeline: " << GemmPipeline::GetName() << '\n'
                       << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
                       << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
+                      << ", kBlockPerCu: {" << GemmConfig::kBlockPerCu << "}" << std::endl;
         }
         if(s.flush_cache_)
         {
@@ -172,15 +171,13 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
             ave_time = ck_tile::launch_kernel_time_mask(
                 s,
                 run_flush_cache,
-                ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
-                    Kernel{}, grids, blocks, 0, kargs));
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
         }
         else
         {
-            ave_time =
-                ck_tile::launch_kernel(s,
-                                       ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
-                                           Kernel{}, grids, blocks, 0, kargs));
+            ave_time = ck_tile::launch_kernel(
+                s,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
         }
         return ave_time;
     };
@@ -210,12 +207,13 @@ template <typename GemmConfig,
           typename APrecType,
           typename BPrecType = APrecType,
           typename CPrecType = APrecType>
-int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+int run_gemm_example_prec_type(std::string a_layout,
+                               std::string b_layout,
+                               ck_tile::ArgParser& arg_parser)
 {
-    using Row                 = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col                 = ck_tile::tensor_layout::gemm::ColumnMajor;
-    auto [result, arg_parser] = create_args(argc, argv);
-    bool preshuffle           = GemmConfig::Preshuffle;
+    using Row       = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col       = ck_tile::tensor_layout::gemm::ColumnMajor;
+    bool preshuffle = GemmConfig::Preshuffle;
 
     if(preshuffle && (a_layout != "R" || b_layout != "C"))
     {
@@ -226,7 +224,7 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
     if(a_layout == "R" && b_layout == "C")
     {
         return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
-            argc, argv, Row{}, Col{}, Row{});
+            arg_parser, Row{}, Col{}, Row{});
     }
     else
     {
@@ -235,12 +233,8 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
 }
 
 template <template <typename PreType> typename GemmConfig>
-int run_gemm_example(int argc, char* argv[])
+int run_gemm_example(ck_tile::ArgParser& arg_parser)
 {
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
     std::string data_type = arg_parser.get_str("prec");
     std::string a_layout  = arg_parser.get_str("a_layout");
     std::string b_layout  = arg_parser.get_str("b_layout");
@@ -248,26 +242,26 @@ int run_gemm_example(int argc, char* argv[])
     if(data_type == "fp16")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
-            a_layout, b_layout, argc, argv);
+            a_layout, b_layout, arg_parser);
     }
     else if(data_type == "bf16")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::bf16_t>(
-            a_layout, b_layout, argc, argv);
+            a_layout, b_layout, arg_parser);
     }
     else if(data_type == "fp8")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
                                           ck_tile::fp8_t,
                                           ck_tile::fp8_t,
-                                          ck_tile::half_t>(a_layout, b_layout, argc, argv);
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
     }
     else if(data_type == "bf8")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
                                           ck_tile::bf8_t,
                                           ck_tile::bf8_t,
-                                          ck_tile::half_t>(a_layout, b_layout, argc, argv);
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
     }
     else
     {
@@ -277,9 +271,13 @@ int run_gemm_example(int argc, char* argv[])
 
 int main(int argc, char* argv[])
 {
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
     try
     {
-        return !run_gemm_example<GemmConfigPreshuffle_2>(argc, argv);
+        return !run_gemm_example<GemmConfigPreshuffleDecode>(arg_parser);
     }
     catch(const std::runtime_error& e)
     {
diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
index cc10394065..229771e536 100644
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -272,6 +272,25 @@ auto shuffle_b(const ck_tile::HostTensor<T>& t)
     return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
 }
 
+// Checks the device result against a reference within the (relative, absolute) tolerances in
+// rtol_atol, prints the thresholds and the verdict labelled with `variant`, and returns it.
+template <typename CDataType>
+bool do_verify(const ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
+               const ck_tile::HostTensor<CDataType>& c_m_n_ref,
+               const ck_tile::tuple<double, double>& rtol_atol,
+               const char* variant)
+{
+    const double rel_tol = rtol_atol.at(ck_tile::number<0>{});
+    const double abs_tol = rtol_atol.at(ck_tile::number<1>{});
+    const bool matches   = ck_tile::check_err(
+        c_m_n_dev_result, c_m_n_ref, "Error: Incorrect results!", rel_tol, abs_tol);
+    std::cout << "Relative error threshold: " << rel_tol
+              << " Absolute error threshold: " << abs_tol << std::endl;
+    std::cout << "The " << variant << " verification result is:"
+              << (matches ? "correct" : "fail") << std::endl;
+    return matches;
+}
+
 template <typename GemmConfig,
           typename ADataType,
           typename BDataType = ADataType,
@@ -279,16 +298,11 @@ template <typename GemmConfig,
           typename ALayout,
           typename BLayout,
           typename CLayout>
-int run_gemm_example_with_layouts(int argc,
-                                  char* argv[],
+int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser,
                                   const ALayout a_layout                  = ALayout{},
                                   const BLayout b_layout                  = BLayout{},
                                   [[maybe_unused]] const CLayout c_layout = CLayout{})
 {
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
     using AccDataType = typename GemmTypeConfig<ADataType, BDataType, CDataType>::AccDataType;
 
     ck_tile::index_t M = arg_parser.get_int("m");
@@ -430,28 +444,20 @@ int run_gemm_example_with_layouts(int argc,
     c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
     bool pass = true;
 
+    // memory on host to store gpu reference result
+    ck_tile::HostTensor<CDataType> c_m_n_ref(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+    c_m_n_ref.SetZero();
+
     if(arg_parser.get_int("v") == 1)
     {
-        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
-            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-        c_m_n_host_ref.SetZero();
-
         ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
-            a_m_k, b_k_n, c_m_n_host_ref);
+            a_m_k, b_k_n, c_m_n_ref);
         const float max_accumulated_value =
-            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+            *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end());
         const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
             K, kbatch, max_accumulated_value);
-        pass = ck_tile::check_err(c_m_n_dev_result,
-                                  c_m_n_host_ref,
-                                  "Error: Incorrect results!",
-                                  rtol_atol.at(ck_tile::number<0>{}),
-                                  rtol_atol.at(ck_tile::number<1>{}));
-
-        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
-                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
-                  << std::endl;
-        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
+        pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "CPU");
     }
     else if(arg_parser.get_int("v") == 2)
     {
@@ -465,13 +471,8 @@ int run_gemm_example_with_layouts(int argc,
             b_k_n_dev_buf.ToDevice(b_k_n.data());
         }
 
-        // memory on host to store gpu reference result
-        ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
-            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
         // memory on device to store gpu reference result
-        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
-
-        c_m_n_gpu_ref.SetZero();
+        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_ref.get_element_space_size_in_bytes());
         c_m_n_gpu_buf_ref.SetZero();
 
         ADataType* d_A = static_cast<ADataType*>(a_m_k_dev_buf.GetDeviceBuffer());
@@ -486,21 +487,13 @@ int run_gemm_example_with_layouts(int argc,
                                     BLayout,
                                     CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
 
-        c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
+        c_m_n_gpu_buf_ref.FromDevice(c_m_n_ref.data());
 
         const float max_accumulated_value =
-            *std::max_element(c_m_n_gpu_ref.mData.begin(), c_m_n_gpu_ref.mData.end());
+            *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end());
         const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
             K, kbatch, max_accumulated_value);
-        pass = ck_tile::check_err(c_m_n_dev_result,
-                                  c_m_n_gpu_ref,
-                                  "Error: Incorrect results!",
-                                  rtol_atol.at(ck_tile::number<0>{}),
-                                  rtol_atol.at(ck_tile::number<1>{}));
-        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
-                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
-                  << std::endl;
-        std::cout << "The GPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
+        pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "GPU");
     }
 
     return pass;
diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp
index d82520241d..4e01710b4d 100644
--- a/example/ck_tile/03_gemm/universal_gemm.cpp
+++ b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -103,7 +103,6 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
                                              DsLayout,
                                              ELayout,
                                              CDEElementWise,
-                                             UniversalGemmProblem::kBlockSize,
                                              TilePartitioner::MPerBlock,
                                              TilePartitioner::NPerBlock,
                                              GemmConfig::M_Warp,
@@ -127,7 +126,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
         {
             grids = Kernel::GridSize(args.M, args.N, args.k_batch);
         }
-        constexpr dim3 blocks = Kernel::BlockSize();
+        dim3 blocks = Kernel::BlockSize();
 
         if(!Kernel::IsSupportedArgument(kargs))
         {
@@ -173,15 +172,13 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
             ave_time = ck_tile::launch_kernel_time_mask(
                 s,
                 run_flush_cache,
-                ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
-                    Kernel{}, grids, blocks, 0, kargs));
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
         }
         else
         {
-            ave_time =
-                ck_tile::launch_kernel(s,
-                                       ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
-                                           Kernel{}, grids, blocks, 0, kargs));
+            ave_time = ck_tile::launch_kernel(
+                s,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
         }
         return ave_time;
     };
@@ -189,17 +186,11 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
     const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
         if(args.k_batch == 1)
         {
-            Run(has_hot_loop_,
-                tail_number_,
-                ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                           ck_tile::memory_operation_enum::set>{});
+            Run(has_hot_loop_, tail_number_, MemoryOpSet{});
         }
         else
         {
-            Run(has_hot_loop_,
-                tail_number_,
-                ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                           ck_tile::memory_operation_enum::atomic_add>{});
+            Run(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
         }
     };
 
@@ -211,12 +202,13 @@ template <typename GemmConfig,
           typename APrecType,
           typename BPrecType = APrecType,
           typename CPrecType = APrecType>
-int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+int run_gemm_example_prec_type(std::string a_layout,
+                               std::string b_layout,
+                               ck_tile::ArgParser& arg_parser)
 {
-    using Row                 = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col                 = ck_tile::tensor_layout::gemm::ColumnMajor;
-    auto [result, arg_parser] = create_args(argc, argv);
-    bool preshuffle           = GemmConfig::Preshuffle;
+    using Row       = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col       = ck_tile::tensor_layout::gemm::ColumnMajor;
+    bool preshuffle = GemmConfig::Preshuffle;
 
     if(preshuffle && std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
     {
@@ -234,12 +226,12 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
         if(a_layout == "R" && b_layout == "C")
         {
             return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
-                argc, argv, Row{}, Col{}, Row{});
+                arg_parser, Row{}, Col{}, Row{});
         }
         else if(a_layout == "C" && b_layout == "C")
         {
             return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
-                argc, argv, Col{}, Col{}, Row{});
+                arg_parser, Col{}, Col{}, Row{});
         }
         else
         {
@@ -252,22 +244,22 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
         if(a_layout == "R" && b_layout == "R")
         {
             return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
-                argc, argv, Row{}, Row{}, Row{});
+                arg_parser, Row{}, Row{}, Row{});
         }
         else if(a_layout == "R" && b_layout == "C")
         {
             return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
-                argc, argv, Row{}, Col{}, Row{});
+                arg_parser, Row{}, Col{}, Row{});
         }
         else if(a_layout == "C" && b_layout == "R")
         {
             return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
-                argc, argv, Col{}, Row{}, Row{});
+                arg_parser, Col{}, Row{}, Row{});
         }
         else if(a_layout == "C" && b_layout == "C")
         {
             return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
-                argc, argv, Col{}, Col{}, Row{});
+                arg_parser, Col{}, Col{}, Row{});
         }
         else
         {
@@ -277,12 +269,8 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
 }
 
 template <template <typename PreType> typename GemmConfig>
-int run_gemm_example(int argc, char* argv[])
+int run_gemm_example(ck_tile::ArgParser& arg_parser)
 {
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
     std::string data_type = arg_parser.get_str("prec");
     std::string a_layout  = arg_parser.get_str("a_layout");
     std::string b_layout  = arg_parser.get_str("b_layout");
@@ -290,33 +278,33 @@ int run_gemm_example(int argc, char* argv[])
     if(data_type == "fp16")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
-            a_layout, b_layout, argc, argv);
+            a_layout, b_layout, arg_parser);
     }
     else if(data_type == "bf16")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::bf16_t>(
-            a_layout, b_layout, argc, argv);
+            a_layout, b_layout, arg_parser);
     }
     else if(data_type == "fp8")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
                                           ck_tile::fp8_t,
                                           ck_tile::fp8_t,
-                                          ck_tile::half_t>(a_layout, b_layout, argc, argv);
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
     }
     else if(data_type == "bf8")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
                                           ck_tile::bf8_t,
                                           ck_tile::bf8_t,
-                                          ck_tile::half_t>(a_layout, b_layout, argc, argv);
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
     }
     else if(data_type == "int8")
     {
         return run_gemm_example_prec_type<GemmConfig<ck_tile::int8_t>,
                                           ck_tile::int8_t,
                                           ck_tile::int8_t,
-                                          ck_tile::int32_t>(a_layout, b_layout, argc, argv);
+                                          ck_tile::int32_t>(a_layout, b_layout, arg_parser);
     }
     else if(data_type == "pk_int4_t")
     {
@@ -326,7 +314,7 @@ int run_gemm_example(int argc, char* argv[])
             return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>,
                                               ck_tile::half_t,
                                               ck_tile::pk_int4_t,
-                                              ck_tile::half_t>(a_layout, b_layout, argc, argv);
+                                              ck_tile::half_t>(a_layout, b_layout, arg_parser);
         }
         else
         {
@@ -341,9 +329,13 @@ int run_gemm_example(int argc, char* argv[])
 
 int main(int argc, char* argv[])
 {
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
     try
     {
-        return !run_gemm_example<GemmConfigComputeV3>(argc, argv);
+        return !run_gemm_example<GemmConfigComputeV3>(arg_parser);
     }
     catch(const std::runtime_error& e)
     {
@@ -351,5 +343,4 @@ int main(int argc, char* argv[])
         // Return a non-zero code to indicate failure
         return EXIT_FAILURE;
     }
-    return EXIT_SUCCESS;
 }
diff --git a/example/ck_tile/04_img2col/README.md b/example/ck_tile/04_img2col/README.md
index df5c51a9c0..3b1b6f999b 100644
--- a/example/ck_tile/04_img2col/README.md
+++ b/example/ck_tile/04_img2col/README.md
@@ -7,7 +7,7 @@ This folder contains example for Image to Column using ck_tile tile-programming
 # in the root of ck_tile
 mkdir build && cd build
 # you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
-sh ../script/cmake-ck-dev.sh  ../ <arch>
+../script/cmake-ck-dev.sh  ../ <arch>
 make tile_example_img2col -j
 ```
 This will result in an executable `build/bin/tile_example_img2col`
diff --git a/example/ck_tile/04_img2col/image_to_column.cpp b/example/ck_tile/04_img2col/image_to_column.cpp
index 299a2f3444..22b5d640d8 100644
--- a/example/ck_tile/04_img2col/image_to_column.cpp
+++ b/example/ck_tile/04_img2col/image_to_column.cpp
@@ -55,13 +55,12 @@ float image_to_column(const image_to_column_traits& traits,
             args.N * args.output_spatial_lengths[0] * args.output_spatial_lengths[1],
             args.filter_spatial_lengths[0] * args.filter_spatial_lengths[1] * args.C,
             args.G);
-        constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 blocks = Kernel::BlockSize();
 
         constexpr ck_tile::index_t kBlockPerCu = 2;
 
         float ave_time = ck_tile::launch_kernel(
-            stream_conf,
-            ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+            stream_conf, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
         return ave_time;
     }
diff --git a/example/ck_tile/05_reduce/reduce.cpp b/example/ck_tile/05_reduce/reduce.cpp
index cf816caa88..a110c2f98d 100644
--- a/example/ck_tile/05_reduce/reduce.cpp
+++ b/example/ck_tile/05_reduce/reduce.cpp
@@ -94,18 +94,18 @@ bool run(const ck_tile::ArgParser& arg_parser)
         throw std::runtime_error("Wrong! Arguments not supported!\n");
     }
 
-    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
-                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
-                                       Kernel{},
-                                       kGridSize,
-                                       kBlockSize,
-                                       0,
-                                       static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
-                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
-                                       input_shape,
-                                       input_strides,
-                                       kept_dim,
-                                       reduce_dims));
+    float ave_time = launch_kernel(
+        ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+        ck_tile::make_kernel<kBlockPerCu>(Kernel{},
+                                          kGridSize,
+                                          kBlockSize,
+                                          0,
+                                          static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
+                                          static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
+                                          input_shape,
+                                          input_strides,
+                                          kept_dim,
+                                          reduce_dims));
 
     std::size_t num_btype = sizeof(XDataType) * N * C * H * W + sizeof(YDataType) * N * C;
 
diff --git a/example/ck_tile/06_permute/README.md b/example/ck_tile/06_permute/README.md
index 03bd810ff4..5e88e71572 100644
--- a/example/ck_tile/06_permute/README.md
+++ b/example/ck_tile/06_permute/README.md
@@ -15,7 +15,7 @@ args:
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
 make tile_example_permute -j
 ```
 This will result in an executable `build/bin/tile_example_permute`
diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
index 688f4f3d50..d486196fc3 100644
--- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -115,11 +115,12 @@ struct matrix_core_swizzle_kernel
 
     __host__ void operator()(const ck_tile::stream_config& s) const
     {
-        ck_tile::kentry<BLOCK_SIZE, 1, kernel><<<grids, BLOCK_SIZE, 0, s.stream_id_>>>(a);
+        ck_tile::kentry<1, kernel><<<grids, BLOCK_SIZE, 0, s.stream_id_>>>(a);
     }
 
     struct kernel
     {
+        static constexpr int kBlockSize = BLOCK_SIZE;
         __device__ static constexpr auto get_src_dist()
         {
             using namespace ck_tile;
diff --git a/example/ck_tile/06_permute/permute.cpp b/example/ck_tile/06_permute/permute.cpp
index 477ae370b9..aafece0f25 100644
--- a/example/ck_tile/06_permute/permute.cpp
+++ b/example/ck_tile/06_permute/permute.cpp
@@ -53,11 +53,11 @@ float permute(permute_traits t, permute_args a, const ck_tile::stream_config& s)
 
         auto kargs = Kernel::MakeKargs(a);
 
-        const dim3 grids      = Kernel::GridSize(a);
-        constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::GridSize(a);
+        const dim3 blocks = Kernel::BlockSize();
 
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, 1>(Kernel{}, grids, blocks, 0, kargs));
+        float ave_time =
+            ck_tile::launch_kernel(s, ck_tile::make_kernel<1>(Kernel{}, grids, blocks, 0, kargs));
 
         return ave_time;
     }
@@ -69,11 +69,11 @@ float permute(permute_traits t, permute_args a, const ck_tile::stream_config& s)
 
         auto kargs = Kernel::MakeKargs(a);
 
-        const dim3 grids      = Kernel::GridSize(a);
-        constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::GridSize(a);
+        const dim3 blocks = Kernel::BlockSize();
 
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, 1>(Kernel{}, grids, blocks, 0, kargs));
+        float ave_time =
+            ck_tile::launch_kernel(s, ck_tile::make_kernel<1>(Kernel{}, grids, blocks, 0, kargs));
 
         return ave_time;
     }
@@ -85,11 +85,11 @@ float permute(permute_traits t, permute_args a, const ck_tile::stream_config& s)
 
         auto kargs = Kernel::MakeKargs(a);
 
-        const dim3 grids      = Kernel::GridSize(a);
-        constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::GridSize(a);
+        const dim3 blocks = Kernel::BlockSize();
 
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, 1>(Kernel{}, grids, blocks, 0, kargs));
+        float ave_time =
+            ck_tile::launch_kernel(s, ck_tile::make_kernel<1>(Kernel{}, grids, blocks, 0, kargs));
 
         return ave_time;
     }
diff --git a/example/ck_tile/09_topk_softmax/README.md b/example/ck_tile/09_topk_softmax/README.md
index 1043012900..2e15aeaae5 100644
--- a/example/ck_tile/09_topk_softmax/README.md
+++ b/example/ck_tile/09_topk_softmax/README.md
@@ -6,7 +6,7 @@ This folder contains example for topk-softmax kernel using ck_tile tile-programm
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
 make tile_example_topk_softmax -j
 ```
 This will result in an executable `build/bin/tile_example_topk_softmax`
diff --git a/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp b/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp
index 249a307b81..c2bad24cfe 100644
--- a/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp
+++ b/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp
@@ -13,11 +13,11 @@
                                                                                                 \
     auto kargs = kernel::MakeKargs(a);                                                          \
                                                                                                 \
-    const dim3 grids      = kernel::GridSize(a);                                                \
-    constexpr dim3 blocks = kernel::BlockSize();                                                \
+    const dim3 grids  = kernel::GridSize(a);                                                    \
+    const dim3 blocks = kernel::BlockSize();                                                    \
                                                                                                 \
-    float ave_time = ck_tile::launch_kernel(                                                    \
-        s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs));               \
+    float ave_time =                                                                            \
+        ck_tile::launch_kernel(s, ck_tile::make_kernel<1>(kernel{}, grids, blocks, 0, kargs));  \
                                                                                                 \
     return ave_time;
 
diff --git a/example/ck_tile/10_rmsnorm2d/README.md b/example/ck_tile/10_rmsnorm2d/README.md
index c067496477..1d27ad153e 100644
--- a/example/ck_tile/10_rmsnorm2d/README.md
+++ b/example/ck_tile/10_rmsnorm2d/README.md
@@ -6,7 +6,7 @@ This folder contains example for Rmsnorm2D forward using ck_tile tile-programmin
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
 make tile_rmsnorm2d_fwd -j
 ```
 This will result in an executable `build/bin/tile_rmsnorm2d_fwd`
diff --git a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
index e0a71452ea..511efeeaec 100644
--- a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
+++ b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
@@ -138,12 +138,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
     auto kargs = Kernel::MakeKargs(args);
 
     const dim3 grids                       = Kernel::GridSize(args);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
+    const dim3 blocks                      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
     auto s = ck_tile::stream_config{nullptr, true, 0, warmup, repeat};
 
-    ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+    ck_tile::launch_kernel(s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
     bool pass = true;
 
diff --git a/example/ck_tile/10_rmsnorm2d/generate.py b/example/ck_tile/10_rmsnorm2d/generate.py
index b0ba400af1..ea8dfdf9ce 100644
--- a/example/ck_tile/10_rmsnorm2d/generate.py
+++ b/example/ck_tile/10_rmsnorm2d/generate.py
@@ -249,7 +249,7 @@ float rmsnorm2d_fwd_(const S& s, A a)
     using Kernel = ck_tile::Rmsnorm2dFwd<Pipeline, Epilogue>;
 
     const dim3 grids                       = Kernel::GridSize(a);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
+    const dim3 blocks                      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
     auto kargs = Kernel::MakeKargs(a);
@@ -257,7 +257,7 @@ float rmsnorm2d_fwd_(const S& s, A a)
         std::cout << ", " << Kernel::GetName() << std::flush;
 
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{{}}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(Kernel{{}}, grids, blocks, 0, kargs));
 }}
 
 """
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md b/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md
index 960369b78d..f9ba76c9e3 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md
@@ -6,7 +6,7 @@ This folder contains example for add + Rmsnorm2D + rowwise dynamic quantization
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
 make tile_add_rmsnorm2d_rdquant_fwd -j
 ```
 This will result in an executable `build/bin/tile_add_rmsnorm2d_rdquant_fwd`
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
index 449bc17e04..ace5fe0c4f 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
@@ -136,12 +136,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
     auto kargs = Kernel::MakeKargs(args);
 
     const dim3 grids                       = Kernel::GridSize(args);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
+    const dim3 blocks                      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
     auto s = ck_tile::stream_config{nullptr, true, 0, warmup, repeat};
 
-    ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+    ck_tile::launch_kernel(s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
     bool pass = true;
 
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp
index 25b10e1dc4..d997596414 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp
@@ -58,7 +58,7 @@ float add_rmsnorm2d_rdquant_fwd_(const S& s, A a)
     using Kernel = ck_tile::AddRmsnorm2dRdquantFwd<Pipeline>;
 
     const dim3 grids                       = Kernel::GridSize(a);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
+    const dim3 blocks                      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
     auto kargs = Kernel::MakeKargs(a);
@@ -66,5 +66,5 @@ float add_rmsnorm2d_rdquant_fwd_(const S& s, A a)
         std::cout << ", " << Kernel::GetName() << std::flush;
 
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 }
diff --git a/example/ck_tile/12_smoothquant/README.md b/example/ck_tile/12_smoothquant/README.md
index d6b815f8cf..6b3acd558b 100644
--- a/example/ck_tile/12_smoothquant/README.md
+++ b/example/ck_tile/12_smoothquant/README.md
@@ -6,7 +6,7 @@ This folder contains example for smoothquant using ck_tile tile-programming impl
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
 make tile_smoothquant -j
 ```
 This will result in an executable `build/bin/tile_smoothquant`
diff --git a/example/ck_tile/12_smoothquant/example_smoothquant.cpp b/example/ck_tile/12_smoothquant/example_smoothquant.cpp
index 5fcacacee8..e688947d71 100644
--- a/example/ck_tile/12_smoothquant/example_smoothquant.cpp
+++ b/example/ck_tile/12_smoothquant/example_smoothquant.cpp
@@ -126,12 +126,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
     auto kargs = Kernel::MakeKargs(args);
 
     const dim3 grids                       = Kernel::GridSize(args);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
+    const dim3 blocks                      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
     auto s = ck_tile::stream_config{nullptr, true, 1, warmup, repeat};
 
-    ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+    ck_tile::launch_kernel(s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
     bool pass = true;
 
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp b/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp
index 555159566e..873a474afb 100644
--- a/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp
@@ -50,7 +50,7 @@ float smoothquant_(const S& s, A a)
     using Kernel = ck_tile::Smoothquant<Pipeline>;
 
     const dim3 grids                       = Kernel::GridSize(a);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
+    const dim3 blocks                      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
     auto kargs = Kernel::MakeKargs(a);
@@ -58,5 +58,5 @@ float smoothquant_(const S& s, A a)
         std::cout << ", " << Kernel::GetName() << std::flush;
 
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 }
diff --git a/example/ck_tile/13_moe_sorting/README.md b/example/ck_tile/13_moe_sorting/README.md
index 1822ff3a37..c99f40aa57 100644
--- a/example/ck_tile/13_moe_sorting/README.md
+++ b/example/ck_tile/13_moe_sorting/README.md
@@ -6,7 +6,7 @@ This folder contains example for moe-sorting kernel using ck_tile tile-programmi
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
 make tile_example_moe_sorting -j
 ```
 This will result in an executable `build/bin/tile_example_moe_sorting`
diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
index a71c5e51a6..d614b8462a 100644
--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
@@ -209,7 +209,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
         auto kargs                            = kernel::MakeKargs(a);                               \
         const dim3 grids                      = kernel::GridSize(a);                                \
         const dim3 blocks                     = kernel::BlockSize(a);                               \
-        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, 0, kargs);         \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                             \
     }()
 
 #define MOE_SORTING_MP_1(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
@@ -227,7 +227,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
         auto kargs                            = kernel::MakeKargs(a);                               \
         const dim3 grids                      = kernel::GridSize(a);                                \
         const dim3 blocks                     = kernel::BlockSize(a);                               \
-        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, 0, kargs);         \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                             \
     }()
 #if MOE_SORTING_SUPPORT_LARGE_EXPERT
 #define MOE_SORTING_MP_2(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
@@ -283,7 +283,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
         const dim3 grids                      = kernel::GridSize(a);                                 \
         const dim3 blocks                     = kernel::BlockSize(a);                                \
         const auto lds_size                   = kernel::GetSmemSize(a);                              \
-        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, lds_size, kargs);   \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, lds_size, kargs);                       \
     }()
 
 #define MOR_SORTING_MP_DISPATCH_(mesh_type_, token_vec_0_, token_vec_1_, token_vec_23_)            \
@@ -334,15 +334,15 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
         }                                                                                          \
     }
 
-#define MOR_SORTING_CLEAR_WS_DISPATCH_(is_local_token_, block_size_, occu_)                 \
-    [&]() {                                                                                 \
-        using problem_ =                                                                    \
-            ck_tile::MoeSortingClearWorkspaceProblem<is_local_token_, block_size_, occu_>;  \
-        using kernel      = ck_tile::MoeSortingClearWorkspaceKernel<problem_>;              \
-        auto kargs        = kernel::MakeKargs(a);                                           \
-        const dim3 grids  = kernel::GridSize(a);                                            \
-        const dim3 blocks = kernel::BlockSize(a);                                           \
-        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, 0, kargs); \
+#define MOR_SORTING_CLEAR_WS_DISPATCH_(is_local_token_, block_size_, occu_)                \
+    [&]() {                                                                                \
+        using problem_ =                                                                   \
+            ck_tile::MoeSortingClearWorkspaceProblem<is_local_token_, block_size_, occu_>; \
+        using kernel      = ck_tile::MoeSortingClearWorkspaceKernel<problem_>;             \
+        auto kargs        = kernel::MakeKargs(a);                                          \
+        const dim3 grids  = kernel::GridSize(a);                                           \
+        const dim3 blocks = kernel::BlockSize(a);                                          \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                    \
     }()
 
 float moe_sorting_mp(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s)
diff --git a/example/ck_tile/14_moe_smoothquant/README.md b/example/ck_tile/14_moe_smoothquant/README.md
index 599b4c3489..c10a922607 100644
--- a/example/ck_tile/14_moe_smoothquant/README.md
+++ b/example/ck_tile/14_moe_smoothquant/README.md
@@ -9,7 +9,7 @@ Unlike standard smoothquant op, the input scale is from different expert `[exper
 ```
 # in the root of ck_tile
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+../script/cmake-ck-dev.sh  ../ <arch>  # you can replace <arch> with gfx90a, gfx942...
 make tile_example_moe_smoothquant -j
 ```
 This will result in an executable `build/bin/tile_example_moe_smoothquant`
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
index 885d9ff7bf..607217ea52 100644
--- a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
@@ -53,7 +53,7 @@ float moe_smoothquant_(const S& s, A a)
     using Kernel = ck_tile::MoeSmoothquant<Pipeline>;
 
     const dim3 grids                       = Kernel::GridSize(a);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
+    const dim3 blocks                      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
     auto kargs = Kernel::MakeKargs(a);
@@ -61,5 +61,5 @@ float moe_smoothquant_(const S& s, A a)
         std::cout << ", " << Kernel::GetName() << std::flush;
 
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 }
diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp
index 6e54df9fde..9d1675386f 100644
--- a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp
@@ -53,7 +53,7 @@ float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a)
     using f_kernel      = ck_tile::FusedMoeGemmKernel<f_partitioner, f_pipeline, void>;
 
     const dim3 grids                       = f_kernel::GridSize(a);
-    constexpr dim3 blocks                  = f_kernel::BlockSize();
+    const dim3 blocks                      = f_kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
     static int printed = 0;
@@ -66,5 +66,5 @@ float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a)
     }
 
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(f_kernel{}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(f_kernel{}, grids, blocks, 0, kargs));
 }
diff --git a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
index 5f87393a0a..441aa84edf 100644
--- a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
@@ -213,7 +213,7 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til
         auto kargs                            = kernel::MakeKargs(a);                               \
         const dim3 grids                      = kernel::GridSize(a);                                \
         const dim3 blocks                     = kernel::BlockSize(a);                               \
-        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, 0, kargs);         \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                             \
     }()
 
 #define MOE_SORTING_MP_1(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
@@ -231,7 +231,7 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til
         auto kargs                            = kernel::MakeKargs(a);                               \
         const dim3 grids                      = kernel::GridSize(a);                                \
         const dim3 blocks                     = kernel::BlockSize(a);                               \
-        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, 0, kargs);         \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                             \
     }()
 #if MOE_SORTING_SUPPORT_LARGE_EXPERT
 #define MOE_SORTING_MP_2(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
@@ -287,7 +287,7 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til
         const dim3 grids                      = kernel::GridSize(a);                                 \
         const dim3 blocks                     = kernel::BlockSize(a);                                \
         const auto lds_size                   = kernel::GetSmemSize(a);                              \
-        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, lds_size, kargs);   \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, lds_size, kargs);                       \
     }()
 
 #define MOR_SORTING_MP_DISPATCH_(mesh_type_, token_vec_0_, token_vec_1_, token_vec_23_)            \
diff --git a/example/ck_tile/16_batched_gemm/README.md b/example/ck_tile/16_batched_gemm/README.md
index 34b56db526..8a64a3912c 100644
--- a/example/ck_tile/16_batched_gemm/README.md
+++ b/example/ck_tile/16_batched_gemm/README.md
@@ -7,7 +7,7 @@ This folder contains example for batched GEMM using ck_tile tile-programming imp
 # in the root of ck_tile
 mkdir build && cd build
 # you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
-sh ../script/cmake-ck-dev.sh  ../ <arch>
+../script/cmake-ck-dev.sh  ../ <arch>
 make tile_example_batched_gemm -j
 ```
 This will result in an executable `build/bin/tile_example_batched_gemm`
diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
index 9616abb800..09ba010e00 100644
--- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
@@ -142,7 +142,6 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre
                                                  DsLayout,
                                                  CLayout,
                                                  CDEElementWise,
-                                                 GemmPipelineProblem::kBlockSize,
                                                  TilePartitioner::MPerBlock,
                                                  TilePartitioner::NPerBlock,
                                                  M_Warp,
@@ -156,8 +155,8 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre
             using Kernel = ck_tile::BatchedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
             auto kargs   = Kernel::MakeKernelArgs(args);
 
-            const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
-            constexpr dim3 blocks = Kernel::BlockSize();
+            const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
+            const dim3 blocks = Kernel::BlockSize();
 
             if(!Kernel::IsSupportedArgument(kargs))
             {
@@ -176,7 +175,7 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre
             }
 
             ave_time = ck_tile::launch_kernel(
-                s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
             return ave_time;
         };
 
diff --git a/example/ck_tile/17_grouped_gemm/README.md b/example/ck_tile/17_grouped_gemm/README.md
index 29642e96c1..8715ee79e1 100644
--- a/example/ck_tile/17_grouped_gemm/README.md
+++ b/example/ck_tile/17_grouped_gemm/README.md
@@ -148,7 +148,7 @@ All the necessary parameters are set, the tiling is computed, the GEMM pipeline
 # in the root of ck_tile
 mkdir build && cd build
 # you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
-sh ../script/cmake-ck-dev.sh  ../ <arch>
+../script/cmake-ck-dev.sh  ../ <arch>
 # The basic pipeline method on the gemm calculation
 make tile_example_grouped_gemm -j
 ```
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
index 897952f03c..527ef1e466 100644
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
@@ -16,91 +16,43 @@
 #include "ck_tile/host.hpp"
 #include "grouped_gemm.hpp"
 
-template <typename ALayout, typename BLayout, typename CLayout>
+template <typename GemmConfig,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType>
 float grouped_gemm_tileloop(const ck_tile::stream_config& s,
                             const ck_tile::index_t num_groups,
                             void* kargs_ptr,
                             bool splitk)
 {
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-    // Memory friendly for Interwave scheduler
-    constexpr ck_tile::index_t M_Tile = 128;
-    constexpr ck_tile::index_t N_Tile = 32;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 4;
-    constexpr ck_tile::index_t N_Warp = 1;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 8;
-
-    constexpr bool DoubleSmemBuffer = false;
-#endif
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-    // Compute friendly for Intrawave scheduler
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = false;
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-    // Compute friendly for Intrawave scheduler
-    // Using the ping pong reader in the lds level
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 32;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = true;
-#endif
-
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
-
-    constexpr int kBlockPerCu                         = 1;
     constexpr ck_tile::index_t TileParitionerGroupNum = 8;
     constexpr ck_tile::index_t TileParitionerM01      = 4;
 
-    using GemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>>;
     using TilePartitioner = ck_tile::
         GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
 
-    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
-    using GemmUniversalTraits = ck_tile::PersistentTileGemmUniversalTraits<kPadM,
-                                                                           kPadN,
-                                                                           kPadK,
-                                                                           DoubleSmemBuffer,
-                                                                           ALayout,
-                                                                           BLayout,
-                                                                           CLayout>;
-    using GemmPipelineProblem =
-        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+    using GemmUniversalTraits =
+        ck_tile::PersistentTileGemmUniversalTraits<GemmConfig::kPadM,
+                                                   GemmConfig::kPadN,
+                                                   GemmConfig::kPadK,
+                                                   GemmConfig::DoubleSmemBuffer,
+                                                   ALayout,
+                                                   BLayout,
+                                                   CLayout>;
 
     float ave_time{0};
 
     const auto Run = [&](const auto memory_operation_) {
-        constexpr auto scheduler        = GEMM_PIPELINE_SCHEDULER;
+        constexpr auto scheduler        = GemmConfig::Scheduler;
         constexpr auto memory_operation = memory_operation_.value;
 
         // We create the GEMM pipeline without specifying hotloop or tailnumber.
@@ -112,7 +64,8 @@ float grouped_gemm_tileloop(const ck_tile::stream_config& s,
                                                                            GemmUniversalTraits,
                                                                            scheduler>;
 
-        using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
         using GemmEpilogue = ck_tile::CShuffleEpilogue<
             ck_tile::CShuffleEpilogueProblem<ADataType,
                                              BDataType,
@@ -122,19 +75,18 @@ float grouped_gemm_tileloop(const ck_tile::stream_config& s,
                                              ck_tile::tuple<>,
                                              CLayout,
                                              ck_tile::element_wise::PassThrough,
-                                             GemmPipelineProblem::kBlockSize,
                                              TilePartitioner::MPerBlock,
                                              TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
                                              UniversalGemmProblem::TransposeC,
                                              memory_operation>>;
-        using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-        constexpr dim3 blocks = Kernel::BlockSize();
-        const dim3 grids      = Kernel::MaxOccupancyGridSize(s);
+        using Kernel      = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        const dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::MaxOccupancyGridSize(s);
 
         if(s.log_level_ > 0)
         {
@@ -145,7 +97,7 @@ float grouped_gemm_tileloop(const ck_tile::stream_config& s,
 
         ave_time =
             ck_tile::launch_kernel(s,
-                                   ck_tile::make_kernel<blocks.x, kBlockPerCu>(
+                                   ck_tile::make_kernel<GemmConfig::kBlockPerCu>(
                                        Kernel{},
                                        grids,
                                        blocks,
@@ -173,4 +125,7 @@ float grouped_gemm_tileloop(const ck_tile::stream_config& s,
 #include "run_grouped_gemm_example.inc"
 
 constexpr bool Persistent = true;
-int main(int argc, char* argv[]) { return !run_grouped_gemm_example<Persistent>(argc, argv); }
+int main(int argc, char* argv[]) // Entry point: runs the example with Persistent (true) and the GemmConfigComputeV4 config template
+{
+    return !run_grouped_gemm_example<Persistent, GemmConfigComputeV4>(argc, argv); // logical NOT: a nonzero result becomes exit code 0, a zero result becomes 1
+} // NOTE(review): run_grouped_gemm_example also has a `return -1` path, which `!` maps to exit code 0 as well - confirm this is intended
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
index 89d91fbef6..e992cb3118 100644
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
@@ -15,24 +15,26 @@
 #define CK_TILE_PIPELINE_COMPUTE_V4 3
 
 #ifndef CK_TILE_PIPELINE_DEFAULT
-#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V4
+#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
 #endif
 
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrMem
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrMem
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Interwave
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV3
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV3
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV4
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV4
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
+template <typename PrecType, ck_tile::index_t M_Warp_Tile> // Compile-time choice of K_Warp_Tile from the element type and M_Warp_Tile
+constexpr ck_tile::index_t get_k_warp_tile()
+{
+#if defined(CK_GFX950_SUPPORT) // only this branch distinguishes fp8/bf8 from other element types
+    constexpr bool is_8bit_float =
+        std::is_same_v<PrecType, ck_tile::fp8_t> || std::is_same_v<PrecType, ck_tile::bf8_t>;
+    if constexpr(M_Warp_Tile == 32)
+        return is_8bit_float ? 64 : 16; // M_Warp_Tile == 32: fp8/bf8 -> 64, otherwise 16
+    else
+        return is_8bit_float ? 128 : 32; // any other M_Warp_Tile: fp8/bf8 -> 128, otherwise 32
 #else
-#error "unsupported CK_TILE_PIPELINE_DEFAULT value"
+    if constexpr(M_Warp_Tile == 32)
+        return 16; // PrecType does not affect the result when CK_GFX950_SUPPORT is not defined
+    else
+        return 32;
 #endif
+}
 
 template <typename DataType>
 struct GemmTypeConfig;
@@ -46,13 +48,109 @@ struct GemmTypeConfig<ck_tile::half_t>
     using AccDataType = float;
 };
 
-using Types = GemmTypeConfig<ck_tile::half_t>;
+template <>
+struct GemmTypeConfig<ck_tile::fp8_t> // Type bundle selected when the precision type is ck_tile::fp8_t
+{
+    using ADataType   = ck_tile::fp8_t;
+    using BDataType   = ck_tile::fp8_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t; // C uses half_t rather than fp8_t
+};
 
-// Specific type aliases for easy access
-using ADataType   = Types::ADataType;
-using BDataType   = Types::BDataType;
-using AccDataType = Types::AccDataType;
-using CDataType   = Types::CDataType;
+struct GemmConfigBase // Shared defaults; GemmConfigComputeV3_2 and GemmConfigComputeV4 derive from this struct
+{
+    static constexpr bool kPadM = false; // kPadM/kPadN/kPadK are forwarded to PersistentTileGemmUniversalTraits in grouped_gemm.cpp
+    static constexpr bool kPadN = false;
+    static constexpr bool kPadK = false;
+
+    static constexpr bool PermuteA = false;
+    static constexpr bool PermuteB = false;
+
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;
+
+    static constexpr int kBlockPerCu                         = 1; // passed as the template argument of ck_tile::make_kernel in grouped_gemm.cpp
+    static constexpr ck_tile::index_t TileParitionerGroupNum = 8; // NOTE(review): grouped_gemm_tileloop declares local constants with these two names - confirm these members are read anywhere
+    static constexpr ck_tile::index_t TileParitionerM01      = 4;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr ck_tile::index_t Pipeline      = CK_TILE_PIPELINE_COMPUTE_V3; // id used to select a PipelineTypeTraits specialization
+    static constexpr ck_tile::index_t NumWaveGroups = 1;
+    static constexpr bool Preshuffle                = false;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV3_2 : public GemmConfigBase // Tile configuration bound to the CK_TILE_PIPELINE_COMPUTE_V3 pipeline id
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType); // element count shrinks as sizeof(PrecType) grows
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3; // hides GemmConfigBase::Pipeline with the same value
+
+    static constexpr int kBlockPerCu = 1; // hides GemmConfigBase::kBlockPerCu with the same value
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV4 : public GemmConfigBase // Tile configuration bound to the CK_TILE_PIPELINE_COMPUTE_V4 pipeline id
+{
+    // Compute V4 only supports the Intrawave scheduler.
+    // It uses the ping-pong reader at the LDS level.
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType); // element count shrinks as sizeof(PrecType) grows
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer     = true; // the V3_2 config sets this to false
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V4; // hides the COMPUTE_V3 default in GemmConfigBase
+
+    static constexpr int kBlockPerCu = 2; // hides the default of 1 in GemmConfigBase
+};
+
+template <ck_tile::index_t PipelineId> // Maps a CK_TILE_PIPELINE_* id to pipeline class templates; the primary template is declared only
+struct PipelineTypeTraits;
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_MEMORY> // CK_TILE_PIPELINE_MEMORY selects the *AgBgCrMem pipeline templates
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V3> // CK_TILE_PIPELINE_COMPUTE_V3 selects the *AgBgCrCompV3 pipeline templates
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V4> // CK_TILE_PIPELINE_COMPUTE_V4 selects the *AgBgCrCompV4 pipeline templates
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
+};
 
 using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs;
 
@@ -69,6 +167,7 @@ auto create_args(int argc, char* argv[])
         .insert("b_layout", "C", "B tensor data layout - Row by default.")
         .insert("c_layout", "R", "C tensor data layout - Row by default.")
         .insert("validate", "1", "0. No validation, 1. Validation on CPU.")
+        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
         .insert("warmup", "10", "number of iterations before benchmark the kernel.")
         .insert("repeat", "100", "number of iterations to benchmark the kernel.")
         .insert("group_count", "8", "group count.")
@@ -98,7 +197,14 @@ float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
                    const ck_tile::stream_config& s,
                    void* kargs_ptr);
 
-template <typename ALayout, typename BLayout, typename CLayout>
+template <typename GemmConfig,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType>
 float grouped_gemm_tileloop(const ck_tile::stream_config& s,
                             const ck_tile::index_t num_groups,
                             void* kargs_ptr,
diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
index fa7f1a31c1..425299203f 100644
--- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
@@ -10,6 +10,7 @@ static constexpr inline auto is_row_major(Layout layout_)
                                                  ck_tile::tensor_layout::gemm::RowMajor>>{};
 }
 
+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
 auto calculate_rtol_atol(const ck_tile::index_t K,
                          const ck_tile::index_t kbatch,
                          const float max_accumulated_value)
@@ -30,7 +31,8 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
     return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
 }
 
-template <typename ADataType,
+template <typename GemmConfig,
+          typename ADataType,
           typename BDataType,
           typename DsDataType,
           typename AccDataType,
@@ -102,8 +104,14 @@ float invoke_gemm(int n_warmup,
                                             kargs.size() * sizeof(ck_tile::GemmTransKernelArg),
                                             hipMemcpyHostToDevice,
                                             stream.stream_id_));
-        ave_time = grouped_gemm_tileloop<ALayout, BLayout, CLayout>(
-            stream, group_count, kargs_ptr, splitk);
+        ave_time = grouped_gemm_tileloop<GemmConfig,
+                                         ALayout,
+                                         BLayout,
+                                         CLayout,
+                                         ADataType,
+                                         BDataType,
+                                         AccDataType,
+                                         CDataType>(stream, group_count, kargs_ptr, splitk);
     }
 
     std::string op_name{"Grouped Gemm"};
@@ -127,7 +135,15 @@ float invoke_gemm(int n_warmup,
     return ave_time;
 }
 
-template <bool Persistent, typename ALayout, typename BLayout, typename CLayout>
+template <bool Persistent,
+          typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AccDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
 int run_grouped_gemm_example_with_layouts(int argc,
                                           char* argv[],
                                           const ALayout a_layout                  = ALayout{},
@@ -243,7 +259,8 @@ int run_grouped_gemm_example_with_layouts(int argc,
             {p_a, p_b, p_c, kbatch, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]});
     }
 
-    invoke_gemm<ADataType,
+    invoke_gemm<GemmConfig,
+                ADataType,
                 BDataType,
                 ck_tile::tuple<>,
                 AccDataType,
@@ -271,7 +288,9 @@ int run_grouped_gemm_example_with_layouts(int argc,
                 a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref);
             const float max_accumulated_value =
                 *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
-            const auto rtol_atol = calculate_rtol_atol(Ks[i], kbatch, max_accumulated_value);
+            const auto rtol_atol =
+                calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+                    Ks[i], kbatch, max_accumulated_value);
             pass &= ck_tile::check_err(c_m_n_tensors[i],
                                        c_m_n_host_ref,
                                        "Error: Incorrect results!",
@@ -288,7 +307,61 @@ int run_grouped_gemm_example_with_layouts(int argc,
     return pass;
 }
 
-template <bool Persistent>
+template <bool Persistent, typename GemmConfig, typename PrecType> // Turns the run-time A/B layout strings into a compile-time layout choice for one precision
+int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+{
+    using Row   = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col   = ck_tile::tensor_layout::gemm::ColumnMajor;
+    using Types = GemmTypeConfig<PrecType>;
+    // A, B, accumulator and C element types, all taken from GemmTypeConfig<PrecType>
+    using ADataType   = typename Types::ADataType;
+    using BDataType   = typename Types::BDataType;
+    using AccDataType = typename Types::AccDataType;
+    using CDataType   = typename Types::CDataType;
+
+    if(a_layout == "R" && b_layout == "C") // "R" selects Row and "C" selects Col; the third (C) argument is Row{} in every branch
+    {
+        return run_grouped_gemm_example_with_layouts<Persistent, // the layout template parameters are deduced from the Row{}/Col{} arguments
+                                                     GemmConfig,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AccDataType>(argc, argv, Row{}, Col{}, Row{});
+    }
+    else if(a_layout == "R" && b_layout == "R")
+    {
+        return run_grouped_gemm_example_with_layouts<Persistent,
+                                                     GemmConfig,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AccDataType>(argc, argv, Row{}, Row{}, Row{});
+    }
+    else if(a_layout == "C" && b_layout == "R")
+    {
+        return run_grouped_gemm_example_with_layouts<Persistent,
+                                                     GemmConfig,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AccDataType>(argc, argv, Col{}, Row{}, Row{});
+    }
+    else if(a_layout == "C" && b_layout == "C")
+    {
+        return run_grouped_gemm_example_with_layouts<Persistent,
+                                                     GemmConfig,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AccDataType>(argc, argv, Col{}, Col{}, Row{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); // reached for any A/B string pair other than the four above; no C layout string is examined here
+    }
+}
+
+template <bool Persistent, template <typename PrecType> typename GemmConfig>
 int run_grouped_gemm_example(int argc, char* argv[])
 {
     auto [result, arg_parser] = create_args(argc, argv);
@@ -297,30 +370,22 @@ int run_grouped_gemm_example(int argc, char* argv[])
         return -1;
     }
 
-    const std::string a_layout = arg_parser.get_str("a_layout");
-    const std::string b_layout = arg_parser.get_str("b_layout");
+    const std::string a_layout  = arg_parser.get_str("a_layout");
+    const std::string b_layout  = arg_parser.get_str("b_layout");
+    const std::string data_type = arg_parser.get_str("prec");
 
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
-    if(a_layout == "R" && b_layout == "C")
+    if(data_type == "fp16")
     {
-        return run_grouped_gemm_example_with_layouts<Persistent>(argc, argv, Row{}, Col{}, Row{});
+        return run_gemm_example_prec_type<Persistent, GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
+            a_layout, b_layout, argc, argv);
     }
-    else if(a_layout == "R" && b_layout == "R")
+    else if(data_type == "fp8")
     {
-        return run_grouped_gemm_example_with_layouts<Persistent>(argc, argv, Row{}, Row{}, Row{});
-    }
-    else if(a_layout == "C" && b_layout == "R")
-    {
-        return run_grouped_gemm_example_with_layouts<Persistent>(argc, argv, Col{}, Row{}, Row{});
-    }
-    else if(a_layout == "C" && b_layout == "C")
-    {
-        return run_grouped_gemm_example_with_layouts<Persistent>(argc, argv, Col{}, Col{}, Row{});
+        return run_gemm_example_prec_type<Persistent, GemmConfig<ck_tile::fp8_t>, ck_tile::fp8_t>(
+            a_layout, b_layout, argc, argv);
     }
     else
     {
-        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
+        throw std::runtime_error("Unsupported data type configuration.");
     }
 }
diff --git a/example/ck_tile/18_flatmm/README.md b/example/ck_tile/18_flatmm/README.md
index beaac785fc..eeaa7658bd 100644
--- a/example/ck_tile/18_flatmm/README.md
+++ b/example/ck_tile/18_flatmm/README.md
@@ -7,7 +7,7 @@ This folder contains example for FLATMM using ck_tile tile-programming implement
 # in the root of ck_tile
 mkdir build && cd build
 # you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
-sh ../script/cmake-ck-dev.sh  ../ <arch>
+../script/cmake-ck-dev.sh  ../ <arch>
 # The basic pipeline method on the flatmm calculation
 make tile_example_flatmm_basic -j
 ```
diff --git a/example/ck_tile/18_flatmm/flatmm_basic.cpp b/example/ck_tile/18_flatmm/flatmm_basic.cpp
index 475a0c7bf3..93117e5b75 100644
--- a/example/ck_tile/18_flatmm/flatmm_basic.cpp
+++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp
@@ -101,7 +101,6 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs<>& args, const ck_tile::stream_c
                                              DsLayout,
                                              ELayout,
                                              CDEElementWise,
-                                             CodegenPipelineProblem::kBlockSize,
                                              TilePartitioner::MPerBlock,
                                              TilePartitioner::NPerBlock,
                                              FlatmmConfig::M_Warp,
@@ -119,8 +118,8 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs<>& args, const ck_tile::stream_c
 
         auto kargs = Kernel::MakeKernelArgs(args);
 
-        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
-        constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+        const dim3 blocks = Kernel::BlockSize();
 
         if(!Kernel::IsSupportedArgument(kargs))
         {
@@ -171,15 +170,13 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs<>& args, const ck_tile::stream_c
             ave_time = ck_tile::launch_kernel_time_mask(
                 s,
                 run_flush_cache,
-                ck_tile::make_kernel<blocks.x, FlatmmConfig::kBlockPerCu>(
-                    Kernel{}, grids, blocks, 0, kargs));
+                ck_tile::make_kernel<FlatmmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
         }
         else
         {
-            ave_time =
-                ck_tile::launch_kernel(s,
-                                       ck_tile::make_kernel<blocks.x, FlatmmConfig::kBlockPerCu>(
-                                           Kernel{}, grids, blocks, 0, kargs));
+            ave_time = ck_tile::launch_kernel(
+                s,
+                ck_tile::make_kernel<FlatmmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
         }
         return ave_time;
     };
@@ -217,6 +214,17 @@ int run_flatmm_example(int argc, char* argv[])
     std::string data_type = arg_parser.get_str("prec");
     std::string a_layout  = arg_parser.get_str("a_layout");
     std::string b_layout  = arg_parser.get_str("b_layout");
+
+    int k        = arg_parser.get_int("k");
+    int stride_b = arg_parser.get_int("stride_b");
+
+    if(b_layout == "C" && stride_b > k)
+    {
+        throw std::runtime_error(
+            "For ColumnMajor layout, StrideB must be smaller than or equal to K (" +
+            std::to_string(k) + ")");
+    }
+
     if(a_layout == "R" && b_layout == "C")
     {
 
diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc
index 8f39b07be5..013db6715d 100644
--- a/example/ck_tile/18_flatmm/run_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc
@@ -42,7 +42,9 @@ auto shuffle_b(const ck_tile::HostTensor<T>& t)
     assert(t.get_lengths().size() == 2);
     int n_                = t.get_lengths()[1];
     int k_                = t.get_lengths()[0];
-    constexpr int divisor = FlatmmConfig::N_Warp_Tile == 32 ? 2 : 4;
+
+    int divisor = ck_tile::is_wave32() ? (FlatmmConfig::N_Warp_Tile == 32 ? 1 : 2)
+                                       : (FlatmmConfig::N_Warp_Tile == 32 ? 2 : 4);
     ck_tile::HostTensor<T> t_view({n_ / FlatmmConfig::N_Warp_Tile,
                                    FlatmmConfig::N_Warp_Tile,
                                    k_ / FlatmmConfig::K_Warp_Tile,
@@ -213,6 +215,16 @@ int run_flatmm_example_with_layouts(int argc,
         ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_host);
         ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_origin_host);
     }
+    else if(init_method == 3)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
+        ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_origin_host);
+    }
+    else if(init_method == 4)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_host);
+        ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_origin_host);
+    }
     else
     {
         a_host.SetZero();
diff --git a/example/ck_tile/19_gemm_multi_d/README.md b/example/ck_tile/19_gemm_multi_d/README.md
index 7e8cd87546..2cf2b1ea03 100644
--- a/example/ck_tile/19_gemm_multi_d/README.md
+++ b/example/ck_tile/19_gemm_multi_d/README.md
@@ -8,7 +8,7 @@ This folder contains example for Multiple D GEMM using ck_tile tile-programming
 mkdir build && cd build
 #you can replace < arch> with the appropriate architecture(for example gfx90a or gfx942) or \
     leave it blank
-sh ../script/cmake-ck-dev.sh  ../ <arch>
+../script/cmake-ck-dev.sh  ../ <arch>
 #The basic pipeline method on the gemm calculation
 make tile_example_gemm_multi_d_fp16 -j
 ```
diff --git a/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp b/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp
index 8971871c14..fc52cb66cc 100644
--- a/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp
+++ b/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp
@@ -146,7 +146,6 @@ auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config&
                                                  DsLayout,
                                                  CLayout,
                                                  CDEElementWise,
-                                                 GemmPipelineProblem::kBlockSize,
                                                  TilePartitioner::MPerBlock,
                                                  TilePartitioner::NPerBlock,
                                                  M_Warp,
@@ -160,8 +159,8 @@ auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config&
             using Kernel = ck_tile::GemmKernelMultiD<TilePartitioner, GemmPipeline, GemmEpilogue>;
             auto kargs   = Kernel::MakeKernelArgs(args);
 
-            const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
-            constexpr dim3 blocks = Kernel::BlockSize();
+            const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+            const dim3 blocks = Kernel::BlockSize();
 
             if(!Kernel::IsSupportedArgument(kargs))
             {
@@ -176,7 +175,7 @@ auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config&
             }
 
             ave_time = ck_tile::launch_kernel(
-                s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
             return ave_time;
         };
 
@@ -197,95 +196,7 @@ auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config&
         }
     };
 
-    if(has_hot_loop)
-    {
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-        if(tail_num == ck_tile::TailNumber::Full)
-        {
-            RunSplitk(ck_tile::bool_constant<true>{},
-                      ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Odd)
-        {
-            RunSplitk(ck_tile::bool_constant<true>{},
-                      ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Even)
-        {
-            RunSplitk(ck_tile::bool_constant<true>{},
-                      ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
-        }
-        else
-        {
-            std::ostringstream err;
-            err << "For compute pipeline tail number should always be Full, but have \"" << tail_num
-                << "\" which is not supported! PrefetchStages: " << BaseGemmPipeline::PrefetchStages
-                << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
-            throw std::runtime_error(err.str());
-        }
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-        if(tail_num == ck_tile::TailNumber::One)
-        {
-            RunSplitk(ck_tile::bool_constant<true>{},
-                      ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::One>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Full)
-        {
-            RunSplitk(ck_tile::bool_constant<true>{},
-                      ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
-        }
-
-        auto check_tail = [&](auto... TNs) {
-            (try_run<BaseGemmPipeline, decltype(TNs)::value>(tail_num), ...);
-        };
-
-        check_tail(ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{},
-                   ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{},
-                   ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Four>{},
-                   ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Five>{},
-                   ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Six>{},
-                   ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Seven>{});
-
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-        if(tail_num == ck_tile::TailNumber::Three)
-        {
-            RunSplitk(
-                ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
-        }
-        else
-        {
-            RunSplitk(ck_tile::bool_constant<true>{},
-                      ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
-        }
-#endif
-    }
-    else
-    {
-        if(tail_num == ck_tile::TailNumber::Full)
-        {
-            RunSplitk(ck_tile::bool_constant<false>{},
-                      ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Odd)
-        {
-            RunSplitk(ck_tile::bool_constant<false>{},
-                      ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Even)
-        {
-            RunSplitk(ck_tile::bool_constant<false>{},
-                      ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
-        }
-        else
-        {
-            std::ostringstream err;
-            err << "Num K loop must be larger than number of prefetech stages."
-                << "\n PrefetchStages: " << BaseGemmPipeline::PrefetchStages
-                << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
-            throw std::runtime_error(err.str());
-        }
-    }
+    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
 
     return ave_time;
 }
diff --git a/example/ck_tile/20_grouped_convolution/CMakeLists.txt b/example/ck_tile/20_grouped_convolution/CMakeLists.txt
index c05dcac09c..5cb1d2650e 100644
--- a/example/ck_tile/20_grouped_convolution/CMakeLists.txt
+++ b/example/ck_tile/20_grouped_convolution/CMakeLists.txt
@@ -6,3 +6,6 @@ target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_GEMM_COMP
 
 add_executable(tile_example_grouped_conv_bwd_weight EXCLUDE_FROM_ALL grouped_convolution_backward_weight.cpp)
 target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+
+add_executable(tile_example_grouped_conv_bwd_data EXCLUDE_FROM_ALL grouped_convolution_backward_data.cpp)
+target_compile_options(tile_example_grouped_conv_bwd_data PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp
new file mode 100644
index 0000000000..308961de5a
--- /dev/null
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "grouped_convolution_utils.hpp"
+
+// Builds and launches the grouped convolution backward-data kernel for one fixed tile setup.
+// Returns the value produced by ck_tile::launch_kernel and throws std::runtime_error when
+// Kernel::IsSupportedArgument() rejects the kernel arguments built from `args`.
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename AccDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename DsDataType     = ck_tile::tuple<>,
+          typename DsLayout       = ck_tile::tuple<>,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float grouped_conv_bwd_data(const ck_tile::GroupedConvBwdDataHostArgs& args,
+                            const ck_tile::stream_config& s)
+{
+    constexpr int kBlockPerCu = 1;
+
+    constexpr ck_tile::index_t M_Tile = 64;
+    constexpr ck_tile::index_t N_Tile = 64;
+    constexpr ck_tile::index_t K_Tile = 32;
+
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    constexpr ck_tile::index_t VectorSizeA = 8;
+    constexpr ck_tile::index_t VectorSizeB = 8;
+    constexpr ck_tile::index_t VectorSizeC = 8;
+
+    // Implicit GEMM Traits
+    using CodegenShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+    constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
+    using TilePartitioner   = ck_tile::GemmTile1DPartitioner<CodegenShape>;
+    using GroupedConvTraitsType =
+        ck_tile::GroupedConvTraits<NDimSpatial, ConvSpec, InLayout, WeiLayout, DsLayout, OutLayout>;
+    using CodegenPipelineProblem =
+        ck_tile::GemmPipelineProblem<InDataType,
+                                     WeiDataType,
+                                     AccDataType,
+                                     CodegenShape,
+                                     typename GroupedConvTraitsType::GroupedConvImplicitGemmTraits,
+                                     InDataType,
+                                     true,
+                                     VectorSizeA,
+                                     VectorSizeB>;
+    using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+
+    // The memory operation arrives as a compile-time tag because the epilogue type depends on it.
+    const auto Run = [&](const auto memory_operation_) {
+        constexpr auto memory_operation = memory_operation_.value;
+
+        using ConvEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<InDataType,
+                                             WeiDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             OutDataType,
+                                             typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                                             ck_tile::tensor_layout::gemm::RowMajor,
+                                             CDEElementWise,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             M_Warp,
+                                             N_Warp,
+                                             M_Warp_Tile,
+                                             N_Warp_Tile,
+                                             K_Warp_Tile,
+                                             CodegenPipelineProblem::TransposeC,
+                                             memory_operation,
+                                             1,
+                                             true,
+                                             VectorSizeC>>;
+
+        using Kernel = ck_tile::GroupedConvolutionBackwardDataKernel<GroupedConvTraitsType,
+                                                                     TilePartitioner,
+                                                                     CodegenPipeline,
+                                                                     ConvEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);
+
+        // Block size is a run-time value; make_kernel<> only takes kBlockPerCu.
+        const dim3 grids  = Kernel::GridSize(args);
+        const dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << CodegenShape::GetName() << '\n'
+                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                      << "pipeline: " << CodegenPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << '\n'
+                      << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
+                      << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
+                      << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+        }
+
+        return ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+    };
+
+    // k_batch == 1 selects memory_operation_enum::set; every other value selects atomic_add.
+    if(args.k_batch == 1)
+    {
+        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                              ck_tile::memory_operation_enum::set>{});
+    }
+    return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                          ck_tile::memory_operation_enum::atomic_add>{});
+}
+
+#include "run_grouped_convolution_bwd_data_example.inc"
+
+// Maps the run-time (in_layout, wei_layout, out_layout) strings onto layout tag types and
+// forwards to run_grouped_conv_bwd_data_example_with_layouts with NDimSpatial 1, 2 or 3.
+// Throws std::runtime_error for any combination other than the three handled below.
+template <typename InPrecType, typename WeiPrecType = InPrecType, typename OutPrecType = InPrecType>
+int run_grouped_conv_bwd_data_example_prec_type(
+    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
+{
+    namespace conv = ck_tile::tensor_layout::convolution;
+
+    const auto layouts_are = [&](const char* in, const char* wei, const char* out) {
+        return in_layout == in && wei_layout == wei && out_layout == out;
+    };
+
+    // NDimSpatial = 1
+    if(layouts_are("NWGC", "GKXC", "NWGK"))
+    {
+        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<1>{},
+                                                              InPrecType,
+                                                              WeiPrecType,
+                                                              OutPrecType>(
+            argc, argv, conv::NWGC{}, conv::GKXC{}, conv::NWGK{});
+    }
+
+    // NDimSpatial = 2
+    if(layouts_are("NHWGC", "GKYXC", "NHWGK"))
+    {
+        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<2>{},
+                                                              InPrecType,
+                                                              WeiPrecType,
+                                                              OutPrecType>(
+            argc, argv, conv::NHWGC{}, conv::GKYXC{}, conv::NHWGK{});
+    }
+
+    // NDimSpatial = 3
+    if(layouts_are("NDHWGC", "GKZYXC", "NDHWGK"))
+    {
+        return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<3>{},
+                                                              InPrecType,
+                                                              WeiPrecType,
+                                                              OutPrecType>(
+            argc, argv, conv::NDHWGC{}, conv::GKZYXC{}, conv::NDHWGK{});
+    }
+
+    throw std::runtime_error("Unsupported memory layout!");
+}
+
+// Parses the command line and dispatches on the "prec" argument ("fp16" or "bf16").
+// Returns -1 when create_args reports failure; throws std::runtime_error for other precisions.
+int run_grouped_conv_bwd_data_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string prec       = arg_parser.get_str("prec");
+    const std::string in_layout  = arg_parser.get_str("in_layout");
+    const std::string wei_layout = arg_parser.get_str("wei_layout");
+    const std::string out_layout = arg_parser.get_str("out_layout");
+
+    if(prec == "fp16")
+    {
+        return run_grouped_conv_bwd_data_example_prec_type<ck_tile::half_t>(
+            in_layout, wei_layout, out_layout, argc, argv);
+    }
+    if(prec == "bf16")
+    {
+        return run_grouped_conv_bwd_data_example_prec_type<ck_tile::bf16_t>(
+            in_layout, wei_layout, out_layout, argc, argv);
+    }
+
+    throw std::runtime_error("Unsupported data type for this operation!");
+}
+// Exit 0 only for a positive result; -1 (argument parsing failed) must not look like success.
+int main(int argc, char* argv[]) { return run_grouped_conv_bwd_data_example(argc, argv) <= 0; }
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
index 67db775e09..debbb6bc0c 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
@@ -78,7 +78,6 @@ float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args,
                                              typename GroupedConvTraitsType::ImplicitGemmDsLayout,
                                              ck_tile::tensor_layout::gemm::RowMajor,
                                              CDEElementWise,
-                                             CodegenPipelineProblem::kBlockSize,
                                              TilePartitioner::MPerBlock,
                                              TilePartitioner::NPerBlock,
                                              M_Warp,
@@ -98,8 +97,8 @@ float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args,
                                                                        ConvEpilogue>;
         auto kargs   = Kernel::MakeKernelArgs(args);
 
-        const dim3 grids      = Kernel::GridSize(kargs);
-        constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::GridSize(kargs);
+        const dim3 blocks = Kernel::BlockSize();
 
         if(!Kernel::IsSupportedArgument(kargs))
         {
@@ -123,7 +122,7 @@ float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args,
         float ave_time = ck_tile::launch_kernel_time_mask(
             s,
             Kernel::Preprocess(kargs, s),
-            ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+            ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
         return ave_time;
     };
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
index ce19c77bc1..6700970583 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
@@ -77,7 +77,6 @@ float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, const ck_til
                                              typename GroupedConvTraitsType::ImplicitGemmDsLayout,
                                              ck_tile::tensor_layout::gemm::RowMajor,
                                              CDEElementWise,
-                                             CodegenPipelineProblem::kBlockSize,
                                              TilePartitioner::MPerBlock,
                                              TilePartitioner::NPerBlock,
                                              M_Warp,
@@ -97,8 +96,8 @@ float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, const ck_til
                                                                 ConvEpilogue>;
         auto kargs   = Kernel::MakeKernelArgs(args);
 
-        const dim3 grids      = Kernel::GridSize(kargs);
-        constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::GridSize(kargs);
+        const dim3 blocks = Kernel::BlockSize();
 
         if(!Kernel::IsSupportedArgument(kargs))
         {
@@ -120,7 +119,7 @@ float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, const ck_til
         }
 
         float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
         return ave_time;
     };
diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc
new file mode 100644
index 0000000000..3e1c13c833
--- /dev/null
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+// Calls grouped_conv_bwd_data, prints the returned time plus derived TFlops and GB/s.
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename AccDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+float invoke_grouped_conv_bwd_data(ck_tile::GroupedConvBwdDataHostArgs& args,
+                                     int n_warmup,
+                                     int n_repeat)
+{
+    float ave_time = grouped_conv_bwd_data<NDimSpatial,
+                                             InDataType,
+                                             WeiDataType,
+                                             AccDataType,
+                                             OutDataType,
+                                             InLayout,
+                                             WeiLayout,
+                                             OutLayout>(
+        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+    // ave_time is reported in ms below, hence the 1e9 (TFlops) and 1e6 (GB/s) divisors.
+    std::size_t flop     = args.GetFlops();
+    std::size_t num_byte = args.GetByte<InDataType, WeiDataType, OutDataType>();
+    float tflops         = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec     = num_byte / 1.E6 / ave_time;
+
+    std::cout << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
+    return ave_time; // the value returned by grouped_conv_bwd_data, passed through unchanged
+}
+// Example driver: parses args, builds and fills tensors, runs bwd data, optionally verifies.
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType = InDataType,
+          typename OutDataType = InDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+int run_grouped_conv_bwd_data_example_with_layouts(
+    int argc, char* argv[], const InLayout, const WeiLayout, const OutLayout)
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1; // create_args reported failure
+
+    using AccDataType = float;
+    // Spatial parameters; presumably populated by fill_spatial_dimensions below (confirm).
+    std::vector<ck_tile::index_t> filter_spatial_lengths;
+    std::vector<ck_tile::index_t> image_spatial_lengths;
+    std::vector<ck_tile::index_t> strides;
+    std::vector<ck_tile::index_t> dilations;
+    std::vector<ck_tile::index_t> lpads;
+    std::vector<ck_tile::index_t> rpads;
+
+    const ck_tile::index_t num_dim_sp = fill_spatial_dimensions(filter_spatial_lengths,
+                                                                image_spatial_lengths,
+                                                                strides,
+                                                                dilations,
+                                                                lpads,
+                                                                rpads,
+                                                                arg_parser);
+
+    ck_tile::conv::ConvParam conv_param{num_dim_sp,
+                                        arg_parser.get_int("g"),
+                                        arg_parser.get_int("n"),
+                                        arg_parser.get_int("k"),
+                                        arg_parser.get_int("c"),
+                                        filter_spatial_lengths,
+                                        image_spatial_lengths,
+                                        strides,
+                                        dilations,
+                                        lpads,
+                                        rpads};
+
+    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
+
+    const auto in_g_n_c_wis_desc =
+        ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+    const auto wei_g_k_c_xs_desc =
+        ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+    const auto out_g_n_k_wos_desc =
+        ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
+    ck_tile::HostTensor<InDataType> input(in_g_n_c_wis_desc);
+    ck_tile::HostTensor<WeiDataType> weight(wei_g_k_c_xs_desc);
+    ck_tile::HostTensor<OutDataType> output(out_g_n_k_wos_desc);
+    // Only weight and output are initialised on the host; init selects the fill pattern.
+    if(init_method == 0)
+    {
+        ck_tile::FillUniformDistribution<WeiDataType>{-1.f, 1.f}(weight);
+        ck_tile::FillUniformDistribution<OutDataType>{-1.f, 1.f}(output);
+    }
+    else if(init_method == 1)
+    {
+        ck_tile::FillMonotonicSeq<WeiDataType>{}(weight);
+        ck_tile::FillMonotonicSeq<OutDataType>{}(output);
+    }
+    else if(init_method == 2) // both arguments equal 1.f; presumably a constant fill (confirm)
+    {
+        ck_tile::FillUniformDistribution<WeiDataType>{1.f, 1.f}(weight);
+        ck_tile::FillUniformDistribution<OutDataType>{1.f, 1.f}(output);
+    }
+    else // any other init value: both tensors are zeroed
+    {
+        weight.SetZero();
+        output.SetZero();
+    }
+
+    ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
+    // input is zeroed on the device and read back after the launch; the other two are uploaded.
+    input_dev_buf.SetZero();
+    weight_dev_buf.ToDevice(weight.data());
+    output_dev_buf.ToDevice(output.data());
+
+    ck_tile::GroupedConvBwdDataHostArgs args(conv_param,
+                                               input_dev_buf.GetDeviceBuffer(),
+                                               weight_dev_buf.GetDeviceBuffer(),
+                                               {},
+                                               output_dev_buf.GetDeviceBuffer(),
+                                               kbatch);
+
+    std::cout << "Run Grouped Conv Bwd Data kernel" << std::endl;
+    std::cout << "input: " << input.mDesc << std::endl;
+    std::cout << "weight: " << weight.mDesc << std::endl;
+    std::cout << "output: " << output.mDesc << std::endl;
+
+    invoke_grouped_conv_bwd_data<NDimSpatial,
+                                   InDataType,
+                                   WeiDataType,
+                                   AccDataType,
+                                   OutDataType,
+                                   InLayout,
+                                   WeiLayout,
+                                   OutLayout>(args, n_warmup, n_repeat);
+
+    input_dev_buf.FromDevice(input.data());
+    bool pass = true;
+    // v == 1 checks against the host reference, v == 2 throws, anything else leaves pass true.
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<InDataType> input_host_ref(in_g_n_c_wis_desc);
+        input_host_ref.SetZero();
+
+        ck_tile::
+            reference_grouped_conv_bwd_data<NDimSpatial, InDataType, WeiDataType, OutDataType>(
+                input_host_ref,
+                weight,
+                output,
+                conv_param.conv_filter_strides_,
+                conv_param.conv_filter_dilations_,
+                conv_param.input_left_pads_,
+                conv_param.input_right_pads_);
+        const ck_tile::index_t GemmK = // NOTE(review): per-(G, K) weight count; verify for bwd
+            weight.get_element_size() / (conv_param.G_ * conv_param.K_);
+        const float max_accumulated_value =
+            *std::max_element(input_host_ref.mData.begin(), input_host_ref.mData.end());
+        const auto rtol_atol =
+            calculate_rtol_atol<InDataType, WeiDataType, AccDataType, OutDataType>(
+                GemmK, kbatch, max_accumulated_value);
+        pass = ck_tile::check_err(input,
+                                  input_host_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        throw std::runtime_error("Unsupported gpu verification !!!");
+    }
+
+    return pass; // bool converted to int: 1 unless the v == 1 comparison failed
+}
diff --git a/example/ck_tile/21_elementwise/elementwise_example.cpp b/example/ck_tile/21_elementwise/elementwise_example.cpp
index 4c501860fd..2cc539e117 100644
--- a/example/ck_tile/21_elementwise/elementwise_example.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example.cpp
@@ -113,7 +113,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     // ElementWiseShape bundles these tiling parameters.
     // It calculates derived properties like threads per wavefront, repeats, vectorization and total
     // block size.
-    using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, ComputeDataType>;
+    using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, XDataType>;
 
     // ElementWisePipelineProblem encapsulates all necessary information for the elementwise kernel:
     // - Data types (input, compute, output).
@@ -167,17 +167,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
     }
 
     // 4. Run the kernel
-    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
-                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
-                                       Kernel{},
-                                       kGridSize,
-                                       kBlockSize,
-                                       0,
-                                       input_size,
-                                       ck_tile::make_tuple(N, 1), // Input Stride
-                                       ck_tile::make_tuple(N, 1), // Output Stride
-                                       input_tensors,
-                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
+    float ave_time = launch_kernel(
+        ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+        ck_tile::make_kernel<kBlockPerCu>(Kernel{},
+                                          kGridSize,
+                                          kBlockSize,
+                                          0,
+                                          input_size,
+                                          ck_tile::make_tuple(N, 1), // Input Stride
+                                          ck_tile::make_tuple(N, 1), // Output Stride
+                                          input_tensors,
+                                          static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
 
     std::cout << "Average time: " << ave_time << " ms" << std::endl;
 
diff --git a/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp b/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
index f18a910813..7087d092a2 100644
--- a/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
@@ -69,7 +69,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using BlockWarps = ck_tile::sequence<1>;
     using WarpTile   = ck_tile::sequence<256>;
 
-    using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, ComputeDataType>;
+    using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, XDataType>;
 
     using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
                                                         ComputeDataType,
@@ -113,7 +113,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     // Run the kernel
     float ave_time = launch_kernel(
         ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
-        ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+        ck_tile::make_kernel<kBlockPerCu>(
             Kernel{},
             kGridSize,
             kBlockSize,
diff --git a/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp b/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp
index affc337c38..28cdaf27b9 100644
--- a/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp
@@ -73,7 +73,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using BlockWarps = ck_tile::sequence<8>;
     using WarpTile   = ck_tile::sequence<64>;
 
-    using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, ComputeDataType>;
+    using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, XDataType>;
 
     // Problem definition for a single input tensor
     using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
@@ -86,7 +86,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     ck_tile::index_t total_elements = M * N;
 
-    constexpr ck_tile::index_t kBlockSize         = 64 * BlockWarps::at(ck_tile::number<0>{});
+    constexpr ck_tile::index_t kBlockSize =
+        ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
     constexpr ck_tile::index_t kBlockPerCu        = 1;
     constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
     ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
@@ -111,17 +112,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
     }
 
     // 4. Run the kernel
-    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
-                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
-                                       Kernel{},
-                                       kGridSize,
-                                       kBlockSize,
-                                       0,             // Shared memory
-                                       op_lengths,    // Logical dimensions for the operation (M, N)
-                                       input_strides, // Strides for input tensor(s)
-                                       output_strides, // Strides for output tensor (N, M)
-                                       input_tensors,
-                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
+    float ave_time = launch_kernel(
+        ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+        ck_tile::make_kernel<kBlockPerCu>(Kernel{},
+                                          kGridSize,
+                                          kBlockSize,
+                                          0,          // Shared memory
+                                          op_lengths, // Logical dimensions for the operation (M, N)
+                                          input_strides,  // Strides for input tensor(s)
+                                          output_strides, // Strides for output tensor (N, M)
+                                          input_tensors,
+                                          static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
 
     std::cout << "Average time: " << ave_time << " ms" << std::endl;
 
diff --git a/example/ck_tile/21_elementwise/elementwise_example_unary.cpp b/example/ck_tile/21_elementwise/elementwise_example_unary.cpp
index 147dfd3424..782d3da24d 100644
--- a/example/ck_tile/21_elementwise/elementwise_example_unary.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example_unary.cpp
@@ -38,7 +38,6 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     using XDataType             = DataType;
     using YDataType             = DataType;
-    using ComputeDataType       = float;
     using XElementwiseOperation = ck_tile::element_wise::UnarySquare;
 
     // 1. Initialize the input data on the host
@@ -64,7 +63,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                              // will cover some part of blockTile)
     using WarpTile = ck_tile::sequence<64>;  // How many elements are covered by a warp
 
-    using Shape   = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, ComputeDataType>;
+    using Shape   = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, XDataType>;
     using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
                                                         XDataType, // ComputeDataType is same as
                                                                    // XDataType in the unary case
@@ -100,17 +99,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
     }
 
     // 4. Run the kernel
-    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
-                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
-                                       Kernel{},
-                                       kGridSize,
-                                       kBlockSize,
-                                       0,
-                                       input_size,
-                                       ck_tile::make_tuple(N, 1), // Input Stride
-                                       ck_tile::make_tuple(N, 1), // Output Stride
-                                       input_tensors,
-                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
+    float ave_time = launch_kernel(
+        ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+        ck_tile::make_kernel<kBlockPerCu>(Kernel{},
+                                          kGridSize,
+                                          kBlockSize,
+                                          0,
+                                          input_size,
+                                          ck_tile::make_tuple(N, 1), // Input Stride
+                                          ck_tile::make_tuple(N, 1), // Output Stride
+                                          input_tensors,
+                                          static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
 
     std::cout << "Average time: " << ave_time << " ms" << std::endl;
 
diff --git a/example/ck_tile/35_batched_transpose/README.md b/example/ck_tile/35_batched_transpose/README.md
index 38bb2b32e4..56e9610b35 100644
--- a/example/ck_tile/35_batched_transpose/README.md
+++ b/example/ck_tile/35_batched_transpose/README.md
@@ -6,7 +6,7 @@ This folder contains example for batched Transpose using ck_tile tile-programmin
 # in the root of ck_tile
 mkdir build && cd build
 # you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
-sh ../script/cmake-ck-dev.sh  ../ <arch>
+../script/cmake-ck-dev.sh  ../ <arch>
 # Make the transpose executable
 make tile_example_batched_transpose -j
 ```
diff --git a/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp b/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp
index 1f0f0b9bc1..931a9dfa3c 100644
--- a/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp
+++ b/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp
@@ -74,8 +74,8 @@ float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_con
 
     auto kargs = kernel::MakeKargs(a);
 
-    const dim3 grids      = kernel::GridSize(a);
-    constexpr dim3 blocks = kernel::BlockSize();
+    const dim3 grids  = kernel::GridSize(a);
+    const dim3 blocks = kernel::BlockSize();
 
     printf("Pipeline: %d\n", Config::kPipelineId);
     printf("Grid: x=%u y=%u z=%u\n", grids.x, grids.y, grids.z);
@@ -96,8 +96,8 @@ float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_con
 
     printf("Launching Kernel...\n");
 
-    float ave_time = ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs));
+    float ave_time =
+        ck_tile::launch_kernel(s, ck_tile::make_kernel<1>(kernel{}, grids, blocks, 0, kargs));
 
     printf("Kernel finished...\n");
 
diff --git a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
index bdcb6f50bd..914fdac0e4 100644
--- a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
+++ b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
@@ -8,6 +8,9 @@ list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion
 if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
     add_executable(tile_example_gemm_aquant_basic EXCLUDE_FROM_ALL gemm_aquant_basic.cpp)
     target_compile_options(tile_example_gemm_aquant_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+
+    add_executable(tile_example_gemm_aquant_preshuffle EXCLUDE_FROM_ALL gemm_aquant_preshuffle.cpp)
+    target_compile_options(tile_example_gemm_aquant_preshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 else()
     message(DEBUG "Skipping ck_tile quant gemm tests for current target")
 endif()
diff --git a/example/ck_tile/38_block_scale_gemm/README.md b/example/ck_tile/38_block_scale_gemm/README.md
index 742a88dee7..fc905790f1 100644
--- a/example/ck_tile/38_block_scale_gemm/README.md
+++ b/example/ck_tile/38_block_scale_gemm/README.md
@@ -7,7 +7,7 @@ This folder contains example for Block Scale GEMM using ck_tile tile-programming
 # in the root of ck_tile
 mkdir build && cd build
 # you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
-sh ../script/cmake-ck-dev.sh  ../ <arch>
+../script/cmake-ck-dev.sh  ../ <arch>
 # The aquant pipeline method on the gemm calculation
 make tile_example_gemm_aquant_basic -j
 ```
diff --git a/example/ck_tile/38_block_scale_gemm/gemm_aquant_basic.cpp b/example/ck_tile/38_block_scale_gemm/gemm_aquant_basic.cpp
index 2667cae788..2ea8530cb2 100644
--- a/example/ck_tile/38_block_scale_gemm/gemm_aquant_basic.cpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_aquant_basic.cpp
@@ -21,7 +21,8 @@ template <typename ADataType,
           typename ALayout,
           typename BLayout,
           typename CLayout,
-          uint32_t QuantGroupSize>
+          uint32_t QuantGroupSize,
+          bool Preshuffle = false>
 float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::stream_config& s)
 {
     constexpr bool kPadM = false;
@@ -52,7 +53,7 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s
     using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
 
     using CodegenGemmTraits =
-        ck_tile::TileGemmAQuantTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+        ck_tile::TileGemmAQuantTraits<kPadM, kPadN, kPadK, Preshuffle, ALayout, BLayout, CLayout>;
 
     using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
                                                                  BDataType,
@@ -95,7 +96,6 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s
                                                     ck_tile::tuple<>,
                                                     CLayout,
                                                     ck_tile::element_wise::PassThrough,
-                                                    CodegenPipelineProblem::kBlockSize,
                                                     TilePartitioner::MPerBlock,
                                                     TilePartitioner::NPerBlock,
                                                     M_Warp,
@@ -110,8 +110,8 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s
 
         auto kargs = Kernel::MakeKernelArgs(args);
 
-        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
-        constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+        const dim3 blocks = Kernel::BlockSize();
 
         if(args.k_batch != 1)
         {
@@ -135,7 +135,7 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s
         }
 
         float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
         return ave_time;
     };
@@ -144,7 +144,7 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s
 
 #include "run_gemm_aquant_example.inc"
 
-template <typename TypeConfig, uint32_t QuantGroupSize>
+template <typename GemmConfig, typename TypeConfig, uint32_t QuantGroupSize>
 int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
 {
     using Row = ck_tile::tensor_layout::gemm::RowMajor;
@@ -156,7 +156,7 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
     {
         if(a_layout == "R" && b_layout == "C")
         {
-            return run_gemm_example_with_layouts<TypeConfig, QuantGroupSize>(
+            return run_gemm_example_with_layouts<GemmConfig, TypeConfig, QuantGroupSize>(
                 argc, argv, Row{}, Row{}, Col{}, Row{});
         }
         else
@@ -172,6 +172,7 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
     return 0;
 }
 
+template <template <typename PreType> typename GemmConfig>
 int run_gemm_example(int argc, char* argv[])
 {
     auto [result, arg_parser] = create_args(argc, argv);
@@ -186,12 +187,14 @@ int run_gemm_example(int argc, char* argv[])
     {
         using TypeConfig =
             decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>{});
-        return run_gemm_example_prec_type<TypeConfig, 128>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
     }
     else if(data_type == "bf8")
     {
         using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, float>{});
-        return run_gemm_example_prec_type<TypeConfig, 128>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
     }
     else if(data_type == "i4fp8")
     {
@@ -199,7 +202,8 @@ int run_gemm_example(int argc, char* argv[])
                                                         ck_tile::fp8_t,
                                                         float,
                                                         ck_tile::fp8_t>{});
-        return run_gemm_example_prec_type<TypeConfig, 128>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
     }
     else if(data_type == "i4bf8")
     {
@@ -207,19 +211,22 @@ int run_gemm_example(int argc, char* argv[])
                                                         ck_tile::bf8_t,
                                                         float,
                                                         ck_tile::bf8_t>{});
-        return run_gemm_example_prec_type<TypeConfig, 128>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
     }
     else if(data_type == "i4f32fp8")
     {
         using TypeConfig =
             decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::fp8_t, float, float>{});
-        return run_gemm_example_prec_type<TypeConfig, 128>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
     }
     else if(data_type == "i4f32bf8")
     {
         using TypeConfig =
             decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::bf8_t, float, float>{});
-        return run_gemm_example_prec_type<TypeConfig, 128>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
     }
     else
     {
@@ -227,4 +234,4 @@ int run_gemm_example(int argc, char* argv[])
     }
 }
 
-int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
+int main(int argc, char* argv[]) { return !run_gemm_example<GemmConfigComputeV3>(argc, argv); }
diff --git a/example/ck_tile/38_block_scale_gemm/gemm_aquant_preshuffle.cpp b/example/ck_tile/38_block_scale_gemm/gemm_aquant_preshuffle.cpp
new file mode 100644
index 0000000000..4adc3df94b
--- /dev/null
+++ b/example/ck_tile/38_block_scale_gemm/gemm_aquant_preshuffle.cpp
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/host.hpp"
+#include "gemm_utils.hpp"
+// Instantiates the AQuant GEMM kernel for these types/layouts, launches it and returns its time.
+template <typename ADataType,
+          typename AQDataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ComputeDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          uint32_t QuantGroupSize,
+          bool Preshuffle = false>
+float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::stream_config& s)
+{
+    constexpr bool kPadM = false;
+    constexpr bool kPadN = false;
+    constexpr bool kPadK = false;
+    // Passed to make_kernel<kBlockPerCu> when the kernel is launched below.
+    constexpr int kBlockPerCu = 1;
+    // Only a row-major C layout is accepted by this example.
+    static_assert(std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::RowMajor>);
+    // First TileGemmShape sequence: tile extents along M, N and K (hard-coded in this file).
+    constexpr ck_tile::index_t M_Tile = 16;
+    constexpr ck_tile::index_t N_Tile = 64;
+    constexpr ck_tile::index_t K_Tile = 256;
+    // Second TileGemmShape sequence; presumably the number of warps along M, N and K.
+    constexpr ck_tile::index_t M_Warp = 1;
+    constexpr ck_tile::index_t N_Warp = 4;
+    constexpr ck_tile::index_t K_Warp = 1;
+    // Third TileGemmShape sequence; presumably the per-warp tile extents along M, N and K.
+    constexpr ck_tile::index_t M_Warp_Tile = 16;
+    constexpr ck_tile::index_t N_Warp_Tile = 16;
+    constexpr ck_tile::index_t K_Warp_Tile = 32;
+
+    using CodegenGemmShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
+
+    using CodegenGemmTraits =
+        ck_tile::TileGemmAQuantTraits<kPadM, kPadN, kPadK, Preshuffle, ALayout, BLayout, CLayout>;
+    // Base problem type; here it only feeds BaseGemmPipeline's hot-loop / tail-number queries.
+    using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
+                                                                 BDataType,
+                                                                 AccDataType,
+                                                                 CodegenGemmShape,
+                                                                 CodegenGemmTraits,
+                                                                 ComputeDataType>;
+
+    using BaseGemmPipeline = ck_tile::BaseAQuantGemmPipelineAgBgCrCompV3<GemmPipelineProblem>;
+    // K is rounded up to a multiple of K_Tile before the partitioner computes the loop count.
+    const ck_tile::index_t K_split      = (args.K + K_Tile - 1) / K_Tile * K_Tile;
+    const ck_tile::index_t num_loop     = TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop             = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num  = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+    constexpr bool transposed_warp_gemm = false;
+    // TailHandler (last line) is handed Run; Run reads both arguments as constants via .value.
+    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+        constexpr bool has_hot_loop_v = has_hot_loop_.value;
+        constexpr auto tail_number_v  = tail_number_.value;
+        // Pipeline problem specialized on the two compile-time values above.
+        using CodegenPipelineProblem =
+            ck_tile::GemmAQuantPipelineProblem<ADataType,
+                                               AQDataType,
+                                               BDataType,
+                                               AccDataType,
+                                               CodegenGemmShape,
+                                               CodegenGemmTraits,
+                                               QuantGroupSize,
+                                               ComputeDataType,
+                                               ck_tile::GemmPipelineScheduler::Intrawave,
+                                               has_hot_loop_v,
+                                               tail_number_v>;
+        using CodegenGemmPipeline = ck_tile::AQuantGemmPipelineAgBgCrCompV3<CodegenPipelineProblem>;
+        using GemmEpilogue        = ck_tile::CShuffleEpilogue<
+                   ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                    BDataType,
+                                                    ck_tile::tuple<>,
+                                                    AccDataType,
+                                                    CDataType,
+                                                    ck_tile::tuple<>,
+                                                    CLayout,
+                                                    ck_tile::element_wise::PassThrough,
+                                                    TilePartitioner::MPerBlock,
+                                                    TilePartitioner::NPerBlock,
+                                                    M_Warp,
+                                                    N_Warp,
+                                                    M_Warp_Tile,
+                                                    N_Warp_Tile,
+                                                    K_Warp_Tile,
+                                                    transposed_warp_gemm,
+                                                    ck_tile::memory_operation_enum::set>>;
+        using Kernel =
+            ck_tile::AQuantGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
+
+        auto kargs = Kernel::MakeKernelArgs(args);
+
+        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+        const dim3 blocks = Kernel::BlockSize();
+        // Reject split-k and argument sets the kernel reports as unsupported before launching.
+        if(args.k_batch != 1)
+        {
+            throw std::runtime_error("split-k is not supported yet!");
+        }
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+        // Print a launch summary when the stream config's log level is positive.
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << CodegenGemmShape::GetName() << '\n'
+                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                      << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+
+        float ave_time = ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+        return ave_time;
+    };
+    return BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num);
+}
+
+#include "run_gemm_aquant_example.inc"
+// Runs the example for TypeConfig after mapping the "R"/"C" layout strings to layout tags.
+template <typename GemmConfig, typename TypeConfig, uint32_t QuantGroupSize>
+int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+{
+    using AType    = typename TypeConfig::ADataType;
+    using RowMajor = ck_tile::tensor_layout::gemm::RowMajor;
+    using ColMajor = ck_tile::tensor_layout::gemm::ColumnMajor;
+    // A element types for which the example is instantiated; anything else throws below.
+    constexpr bool a_type_supported = std::is_same_v<AType, ck_tile::pk_int4_t> ||
+                                      std::is_same_v<AType, ck_tile::fp8_t> ||
+                                      std::is_same_v<AType, ck_tile::bf8_t>;
+
+    if constexpr(!a_type_supported)
+    {
+        throw std::runtime_error("Unsupported data type for A.");
+    }
+    else
+    {
+        // "R" for A together with "C" for B is the only layout pair handled.
+        if(a_layout != "R" || b_layout != "C")
+        {
+            throw std::runtime_error("Unsupported memory layout for the input matrices!");
+        }
+        return run_gemm_example_with_layouts<GemmConfig, TypeConfig, QuantGroupSize>(
+            argc, argv, RowMajor{}, RowMajor{}, ColMajor{}, RowMajor{});
+    }
+    return 0; // not reached: both branches above return or throw
+}
+// Parses the arguments and dispatches on "prec"; throws std::runtime_error if it is unknown.
+template <template <typename PreType> typename GemmConfig>
+int run_gemm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return 0; // main() negates the result; a nonzero value here would exit with status 0
+
+    std::string data_type = arg_parser.get_str("prec");
+    std::string a_layout  = arg_parser.get_str("a_layout");
+    std::string b_layout  = arg_parser.get_str("b_layout");
+
+    if(data_type == "fp8")
+    {
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "bf8")
+    {
+        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, float>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "i4fp8")
+    {
+        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
+                                                        ck_tile::fp8_t,
+                                                        float,
+                                                        ck_tile::fp8_t>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "i4bf8")
+    {
+        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
+                                                        ck_tile::bf8_t,
+                                                        float,
+                                                        ck_tile::bf8_t>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "i4f32fp8")
+    {
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::fp8_t, float, float>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "i4f32bf8")
+    {
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::bf8_t, float, float>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for this operation !!!");
+    }
+}
+// Exit status is the negation of run_gemm_example's result: nonzero -> 0, zero -> 1.
+int main(int argc, char* argv[]) { return !run_gemm_example<GemmConfigPreshufle_AQ>(argc, argv); }
diff --git a/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp b/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
index 35e80ddb89..0d0da93133 100644
--- a/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
@@ -35,7 +35,7 @@ constexpr ck_tile::index_t get_k_warp_tile()
 #endif
 }
 template <typename PrecType, ck_tile::index_t M_Warp_Tile>
-constexpr ck_tile::index_t get_k_warp_tile_flatmm()
+constexpr ck_tile::index_t get_k_from_preshuffled_warp_tile()
 {
 #if defined(__gfx950__)
     if constexpr(M_Warp_Tile == 32)
@@ -138,7 +138,7 @@ struct GemmConfigComputeV3 : public GemmConfigBase
     // Compute V3 only support Intrawave scheduler
     static constexpr ck_tile::index_t M_Tile = 32;
     static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);
 
     static constexpr ck_tile::index_t M_Warp = 1;
     static constexpr ck_tile::index_t N_Warp = 4;
@@ -265,7 +265,8 @@ struct GemmConfigPreshufle_1 : public GemmConfigBase
 
     static constexpr ck_tile::index_t M_Warp_Tile = 32;
     static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile_flatmm<PrecType, M_Warp_Tile>();
+    static constexpr ck_tile::index_t K_Warp_Tile =
+        get_k_from_preshuffled_warp_tile<PrecType, M_Warp_Tile>();
 
     static constexpr int kBlockPerCu           = 2;
     static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
@@ -287,7 +288,8 @@ struct GemmConfigPreshufle_2 : public GemmConfigBase
 
     static constexpr ck_tile::index_t M_Warp_Tile = 16;
     static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile_flatmm<PrecType, M_Warp_Tile>();
+    static constexpr ck_tile::index_t K_Warp_Tile =
+        get_k_from_preshuffled_warp_tile<PrecType, M_Warp_Tile>();
 
     static constexpr int kBlockPerCu           = 2;
     static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
@@ -296,62 +298,25 @@ struct GemmConfigPreshufle_2 : public GemmConfigBase
     static constexpr bool DoubleSmemBuffer     = false;
 };
 
-template <typename ADataType, typename BDataType = ADataType, typename CDataType = ADataType>
-struct GemmTypeConfig;
-
-template <>
-struct GemmTypeConfig<ck_tile::half_t>
+template <typename PrecType>
+struct GemmConfigPreshufle_AQ : public GemmConfigBase
 {
-    using ADataType   = ck_tile::half_t;
-    using BDataType   = ck_tile::half_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-    // ToDo: Add more bias config to support different categories of GEMM.
-};
+    static constexpr ck_tile::index_t M_Tile = 16;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);
 
-template <>
-struct GemmTypeConfig<ck_tile::bf16_t, ck_tile::bf16_t, ck_tile::bf16_t>
-{
-    using ADataType   = ck_tile::bf16_t;
-    using BDataType   = ck_tile::bf16_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::bf16_t;
-};
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
 
-template <>
-struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::fp8_t;
-    using BDataType   = ck_tile::fp8_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile =
+        get_k_from_preshuffled_warp_tile<PrecType, M_Warp_Tile>();
 
-template <>
-struct GemmTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::bf8_t;
-    using BDataType   = ck_tile::bf8_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::half_t;
-    using BDataType   = ck_tile::pk_int4_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::int8_t, ck_tile::int8_t, int32_t>
-{
-    using ADataType   = ck_tile::int8_t;
-    using BDataType   = ck_tile::int8_t;
-    using AccDataType = int32_t;
-    using CDataType   = int32_t;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE;
+    static constexpr bool Preshuffle           = true;
+    static constexpr bool DoubleSmemBuffer     = false;
 };
 
 template <typename ADataType_,
@@ -424,7 +389,7 @@ struct GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, float>
     using QDataType   = float;
     using BDataType   = ck_tile::fp8_t;
     using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };
 
 template <>
@@ -434,7 +399,7 @@ struct GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, float>
     using QDataType   = float;
     using BDataType   = ck_tile::bf8_t;
     using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };
 
 template <>
@@ -444,7 +409,7 @@ struct GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::fp8_t, float, ck_tile::f
     using QDataType   = ck_tile::fp8_t;
     using BDataType   = ck_tile::fp8_t;
     using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };
 
 template <>
@@ -454,7 +419,7 @@ struct GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, float, ck_tile::fp8_t
     using QDataType   = ck_tile::fp8_t;
     using BDataType   = ck_tile::fp8_t;
     using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };
 
 template <>
@@ -464,7 +429,7 @@ struct GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, float, ck_tile::bf8_t
     using QDataType   = ck_tile::bf8_t;
     using BDataType   = ck_tile::bf8_t;
     using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };
 
 template <>
@@ -474,7 +439,7 @@ struct GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::bf8_t, float, ck_tile::b
     using QDataType   = ck_tile::bf8_t;
     using BDataType   = ck_tile::bf8_t;
     using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };
 
 template <>
@@ -484,7 +449,7 @@ struct GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::fp8_t, float, float>
     using QDataType   = float;
     using BDataType   = ck_tile::fp8_t;
     using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };
 
 template <>
@@ -494,7 +459,7 @@ struct GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::bf8_t, float, float>
     using QDataType   = float;
     using BDataType   = ck_tile::bf8_t;
     using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };
 
 template <>
@@ -504,7 +469,7 @@ struct GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::pk_int4_t, float, ck_tile::f
     using QDataType   = ck_tile::fp8_t;
     using BDataType   = ck_tile::pk_int4_t;
     using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };
 
 template <>
@@ -514,7 +479,7 @@ struct GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::pk_int4_t, float, ck_tile::b
     using QDataType   = ck_tile::bf8_t;
     using BDataType   = ck_tile::pk_int4_t;
     using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };
 
 template <>
@@ -524,7 +489,7 @@ struct GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::pk_int4_t, float, float>
     using QDataType   = float;
     using BDataType   = ck_tile::pk_int4_t;
     using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };
 
 template <>
@@ -534,7 +499,7 @@ struct GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::pk_int4_t, float, float>
     using QDataType   = float;
     using BDataType   = ck_tile::pk_int4_t;
     using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };
 
 template <typename T>
@@ -660,7 +625,7 @@ auto create_args(int argc, char* argv[])
         .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
         .insert("prec", "i4fp8", "data type. fp8/bf8/i4fp8/i4bf8/i4f32fp8/i4f32bf8")
         .insert("warmup", "50", "number of iterations before benchmark the kernel")
-        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("repeat", "1000", "number of iterations to benchmark the kernel")
         .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
         .insert("split_k", "1", "splitK value")
         .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
diff --git a/example/ck_tile/38_block_scale_gemm/run_gemm_aquant_example.inc b/example/ck_tile/38_block_scale_gemm/run_gemm_aquant_example.inc
index 9bdef9755b..6b5e01ca4c 100644
--- a/example/ck_tile/38_block_scale_gemm/run_gemm_aquant_example.inc
+++ b/example/ck_tile/38_block_scale_gemm/run_gemm_aquant_example.inc
@@ -4,6 +4,7 @@
 #pragma once
 #include <bit>
 #include <random>
+#include <stdexcept>
 
 template <typename Layout>
 static constexpr inline auto is_row_major(Layout layout_)
@@ -12,6 +13,24 @@ static constexpr inline auto is_row_major(Layout layout_)
                                                  ck_tile::tensor_layout::gemm::RowMajor>>{};
 }
 
+template <typename T> // copies a 2-D (m, aqk) tensor into block_aq_k-wide chunks, then permutes
+auto shuffle_aq(const ck_tile::HostTensor<T>& t, int block_aq_k)
+{
+    if(t.get_lengths().size() != 2) // only a 2-D tensor is accepted
+    {
+        throw std::runtime_error("Host tensor is not rank 2 tensor.");
+    }
+    int m_   = t.get_lengths()[0];
+    int aqk_ = t.get_lengths()[1];
+    if(block_aq_k <= 0 || aqk_ % block_aq_k != 0) // <= 0 tested first: no % or / by zero below
+    {
+        throw std::runtime_error("shuffle_aq needs block_aq_k > 0 and aqk divisible by it.");
+    }
+    ck_tile::HostTensor<T> t_view({m_, aqk_ / block_aq_k, block_aq_k}); // 3-D copy target
+    std::copy(t.begin(), t.end(), t_view.begin());
+    return ck_tile::reference_permute(t_view, {1, 0, 2}); // presumably swaps dims 0 and 1
+}
+
 template <typename ADataType,
           typename AQDataType,
           typename BDataType,
@@ -21,7 +40,8 @@ template <typename ADataType,
           typename AQLayout,
           typename BLayout,
           typename CLayout,
-          uint32_t QuantGroupSize>
+          uint32_t QuantGroupSize,
+          bool Preshuffle = false>
 float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                   ck_tile::DeviceMem& aq_m_aqk_dev_buf,
                   ck_tile::DeviceMem& b_k_n_dev_buf,
@@ -62,7 +82,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                                       ALayout,
                                       BLayout,
                                       CLayout,
-                                      QuantGroupSize>(
+                                      QuantGroupSize,
+                                      Preshuffle>(
         args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
 
     std::size_t flop     = std::size_t(2) * M * N * K;
@@ -85,7 +106,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
     return ave_time;
 }
 
-template <typename TypeConfig,
+template <typename GemmConfig,
+          typename TypeConfig,
           uint32_t QuantGroupSize,
           typename ALayout,
           typename AQLayout,
@@ -184,8 +206,18 @@ int run_gemm_example_with_layouts(int argc,
     ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
     ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
 
+    if constexpr(GemmConfig::Preshuffle)
+    {
+        ck_tile::HostTensor<AQDataType> aq_shuffle_host =
+            shuffle_aq(aq_m_aqk, GemmConfig::K_Tile / QuantGroupSize);
+        aq_m_aqk_dev_buf.ToDevice(aq_shuffle_host.data());
+    }
+    else
+    {
+        aq_m_aqk_dev_buf.ToDevice(aq_m_aqk.data());
+    }
+
     a_m_k_dev_buf.ToDevice(a_m_k.data());
-    aq_m_aqk_dev_buf.ToDevice(aq_m_aqk.data());
     b_k_n_dev_buf.ToDevice(b_k_n.data());
     c_m_n_dev_buf.SetZero();
     c_m_n_dev_result.SetZero();
@@ -199,21 +231,22 @@ int run_gemm_example_with_layouts(int argc,
                 AQLayout,
                 BLayout,
                 CLayout,
-                QuantGroupSize>(a_m_k_dev_buf,
-                                aq_m_aqk_dev_buf,
-                                b_k_n_dev_buf,
-                                c_m_n_dev_buf,
-                                M,
-                                N,
-                                K,
-                                AQK,
-                                stride_A,
-                                stride_AQ,
-                                stride_B,
-                                stride_C,
-                                kbatch,
-                                n_warmup,
-                                n_repeat);
+                QuantGroupSize,
+                GemmConfig::Preshuffle>(a_m_k_dev_buf,
+                                        aq_m_aqk_dev_buf,
+                                        b_k_n_dev_buf,
+                                        c_m_n_dev_buf,
+                                        M,
+                                        N,
+                                        K,
+                                        AQK,
+                                        stride_A,
+                                        stride_AQ,
+                                        stride_B,
+                                        stride_C,
+                                        kbatch,
+                                        n_warmup,
+                                        n_repeat);
 
     c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
     bool pass = true;
diff --git a/example/ck_tile/39_copy/README.md b/example/ck_tile/39_copy/README.md
index f45fcb682b..b5bc5d56be 100644
--- a/example/ck_tile/39_copy/README.md
+++ b/example/ck_tile/39_copy/README.md
@@ -12,7 +12,7 @@ This experimental kernel is intended for novice CK developers. It introduces the
 mkdir build && cd build
 # you can replace <arch> with the appropriate architecture 
 # (for example gfx90a or gfx942) or leave it blank
-sh ../script/cmake-ck-dev.sh  ../ <arch>
+../script/cmake-ck-dev.sh  ../ <arch>
 # Make the copy kernel executable
 make tile_example_copy -j
 ```
@@ -38,14 +38,14 @@ The CK Tile framework is built around four key architectural components that wor
 Defines the **hierarchical tile structure** and **memory layout** of the kernel:
 
 ```cpp
-using Shape = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, Vector>;
+using Shape = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, ThreadTile>;
 ```
 
 **Components:**
 - **BlockWaves**: Number of concurrent waves per block (e.g., `seq<4, 1>` for 4 waves along M, 1 along N)
 - **BlockTile**: Total elements processed by one block (e.g., `seq<512, 8>`)
 - **WaveTile**: Elements processed by one wave (e.g., `seq<32, 8>`)
-- **Vector**: Elements processed by one thread (e.g., `seq<1, 4>` for 4 contiguous elements)
+- **ThreadTile**: Elements processed by one thread (e.g., `seq<1, 4>` for 4 contiguous elements)
 
 **Purpose**: Defines the **work distribution hierarchy** from threads → waves → blocks.
 
@@ -91,7 +91,7 @@ Defines the **execution flow** and **memory movement patterns**:
 
 ```cpp
 // Complete kernel definition
-using Shape   = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, Vector>;
+using Shape   = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, ThreadTile>;
 using Problem = ck_tile::TileCopyProblem<XDataType, Shape>;
 using Policy  = ck_tile::TileCopyPolicy<Problem>;
 using Kernel  = ck_tile::TileCopyKernel<Problem, Policy>;
@@ -113,7 +113,7 @@ using Kernel  = ck_tile::TileCopyKernel<Problem, Policy>;
 
 #### **Reusability**
 - Same **Shape** can be used with different **Problems**
-- Same **Policy** can be applied to different **Shapes**
+- Same **Policy** can be applied to different **Problems**
 - **Pipelines** can be reused across different kernels
 
 #### **Performance Optimization**
@@ -127,16 +127,16 @@ using Kernel  = ck_tile::TileCopyKernel<Problem, Policy>;
 
 The CK Tile framework organizes work in a hierarchical manner:
 
-1. **Vector**: Number of contiguous elements processed by a single thread
+1. **ThreadTile**: Number of contiguous elements processed by a single thread
    - Enables vectorized memory loads/stores.
-   - Example: `Vector = seq<1, 4>` means each thread loads 4 contiguous elements along the N dimension
-   - A Vector can be imagined as a thread-level tile
+   - Example: `ThreadTile = seq<1, 4>` means each thread loads 4 contiguous elements along the N dimension
+   - A ThreadTile can be imagined as a thread-level tile
 
-2. **WaveTile**: Number of elements covered by a single wave (64 threads on AMD)
-   - Must satisfy: `Wave_Tile_M / Vector_M * Wave_Tile_N / Vector_N == WaveSize`
+2. **WaveTile**: Number of elements covered by a single wave (64 threads on CDNA, 32 threads on RDNA)
+   - Must satisfy: `Wave_Tile_M / ThreadTile_M * Wave_Tile_N / ThreadTile_N == WaveSize`
    - This ensures the number of threads needed equals the wave size
-   - Example: `WaveTile = seq<64, 4>` with `Vector = seq<1, 4>` means:
-     - Each thread handles 4 elements (Vector_N = 4)
+   - Example: `WaveTile = seq<64, 4>` with `ThreadTile = seq<1, 4>` means:
+     - Each thread handles 4 elements (ThreadTile_N = 4)
      - Wave needs 64×4/4 = 64 threads to cover 64×4 = 256 elements
      - Total elements = 256, which requires WaveSize = 64 threads
 
@@ -144,8 +144,9 @@ The CK Tile framework organizes work in a hierarchical manner:
    - Example: `BlockTile = seq<256, 64>` means each block processes 256×64 elements
 
 4. **BlockWaves**: Number of concurrent waves active in a block
-   - Usually 4 waves per block on modern AMD GPUs
-   - Example: `BlockWaves = seq<4, 1>` means 4 waves along M dimension, 1 along N
+   - Typical: 4 waves for heavy workloads (e.g., GEMM)
+   - Limit: up to 1024 threads per block → up to 16 waves (CDNA) or 32 waves (RDNA)
+   - Example: `BlockWaves = seq<4, 1>` means 4 waves along M, 1 along N
 
 ### Wave Repetition
 
@@ -159,7 +160,7 @@ static constexpr index_t WaveRepetitionPerBlock_N =
     Block_Tile_N / (Waves_Per_Block_N * Wave_Tile_N);
 ```
 
-**Key Insight**: When waves repeat, the effective work per thread becomes `Vector * Repeat`, not just `Vector`.
+**Key Insight**: When waves repeat, the effective work per thread becomes `ThreadTile * Repeat`, not just `ThreadTile`.
 
 ## Tile Distribution Encoding
 
@@ -183,8 +184,9 @@ constexpr auto outer_encoding =
   - M2: Number of threads per wave along M
 - **N0, N1**: Distribution along N dimension
   - N0: Number of threads along N
-  - N1: Vector size (elements per thread)
-- **YIELD arguments**: Both `Repeat` and `Vector` because effective work per thread is `Vector * Repeat`
+  - N1: ThreadTile size (elements per thread)
+- **Order and layout**: The inner-most (rightmost) dimension is the fastest-changing. Choosing `N1 = ThreadTile_N` maps vector width to contiguous addresses, i.e., row-major access in this example.
+- **YIELD arguments**: Both `Repeat` and `ThreadTile` because effective work per thread is `ThreadTile * Repeat`
 
 ## Tensor Abstractions
 
@@ -194,7 +196,7 @@ Defines the logical structure of a tensor:
 auto desc = make_naive_tensor_descriptor(
     make_tuple(M, N),           // tensor dimensions
     make_tuple(N, 1),           // strides
-    number<Vector_N>{},         // vector length for vectorized access
+    number<ThreadTile_N>{},     // per-thread vector length
     number<1>{}                 // guaranteed last dimension vector stride
 );
 ```
@@ -206,7 +208,7 @@ auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
     p_x,                        // memory buffer
     make_tuple(M, N),           // dimensions
     make_tuple(N, 1),           // strides  
-    number<S::Vector_N>{},      // vector length
+    number<S::ThreadTile_N>{},  // per-thread vector length
     number<1>{}                 // guaranteed last dimension vector stride
 );
 ```
@@ -247,10 +249,10 @@ struct TileCopyKernel
 1. **Tensor View Creation**:
    ```cpp
    const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
-       p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+       p_x, make_tuple(M, N), make_tuple(N, 1), number<S::ThreadTile_N>{}, number<1>{});
    ```
    - Creates views for both input and output tensors
-   - Specifies vectorized access with `Vector_N` elements per load
+   - Specifies vectorized access with `ThreadTile_N` elements per load
 
 2. **Tile Window Creation**:
    ```cpp
diff --git a/example/ck_tile/39_copy/copy_basic.cpp b/example/ck_tile/39_copy/copy_basic.cpp
index d46add879c..3f36d7f4f0 100644
--- a/example/ck_tile/39_copy/copy_basic.cpp
+++ b/example/ck_tile/39_copy/copy_basic.cpp
@@ -54,7 +54,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     x_buf.ToDevice(x_host.data());
 
     // Define tile configuration
-    using Vector     = ck_tile::sequence<1, 4>;   // vector size along M and N dimension
+    using ThreadTile = ck_tile::sequence<1, 4>;   // per-thread tile size along M and N
     using WaveTile   = ck_tile::sequence<64, 4>;  // wave size along M and N dimension
     using BlockWaves = ck_tile::sequence<4, 1>;   // number of waves along M dimension
     using BlockTile  = ck_tile::sequence<512, 4>; // block size along M and N dimension
@@ -65,7 +65,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     std::cout << "grid size (number of blocks per grid) " << kGridSize << std::endl;
 
     // Define kernel types
-    using Shape   = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, Vector>;
+    using Shape   = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, ThreadTile>;
     using Problem = ck_tile::TileCopyProblem<XDataType, Shape>;
     using Policy  = ck_tile::TileCopyPolicy<Problem>;
     using Kernel  = ck_tile::ElementWiseTileCopyKernel<Problem, Policy>;
@@ -88,8 +88,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
               << " " << BlockTile::at(ck_tile::number<1>{}) << std::endl;
     std::cout << "wave tile (number of elements per wave) " << WaveTile::at(ck_tile::number<0>{})
               << " " << WaveTile::at(ck_tile::number<1>{}) << std::endl;
-    std::cout << "vector (number of elements per thread) " << Vector::at(ck_tile::number<0>{})
-              << " " << Vector::at(ck_tile::number<1>{}) << std::endl;
+    std::cout << "thread tile (number of elements per thread) "
+              << ThreadTile::at(ck_tile::number<0>{}) << " " << ThreadTile::at(ck_tile::number<1>{})
+              << std::endl;
     std::cout << "WaveRepetitionPerBlock_M =  " << Shape::WaveRepetitionPerBlock_M << " --> ("
               << Shape::Block_Tile_M << "/" << Shape::Waves_Per_Block_M << "*" << Shape::Wave_Tile_M
               << ")" << std::endl;
@@ -98,16 +99,16 @@ bool run(const ck_tile::ArgParser& arg_parser)
               << ")" << std::endl;
 
     // Launch kernel
-    float ave_time = launch_kernel(
-        ck_tile::stream_config{nullptr, true, warmup, repeat, 1},
-        ck_tile::make_kernel<kBlockSize, 1>(Kernel{},
-                                            kGridSize,
-                                            kBlockSize,
-                                            0,
-                                            static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
-                                            static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
-                                            m,
-                                            n));
+    float ave_time =
+        launch_kernel(ck_tile::stream_config{nullptr, true, warmup, repeat, 1},
+                      ck_tile::make_kernel<1>(Kernel{},
+                                              kGridSize,
+                                              kBlockSize,
+                                              0,
+                                              static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
+                                              static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
+                                              m,
+                                              n));
 
     // Calculate and print performance metrics
     std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m * n;
diff --git a/example/ck_tile/39_copy/copy_basic.hpp b/example/ck_tile/39_copy/copy_basic.hpp
index bbeb964fda..1a313e1353 100644
--- a/example/ck_tile/39_copy/copy_basic.hpp
+++ b/example/ck_tile/39_copy/copy_basic.hpp
@@ -17,14 +17,14 @@ namespace ck_tile {
  * @tparam BlockWaves Number of waves along seq<M, N>
  * @tparam BlockTile Block size, seq<M, N>
  * @tparam WaveTile Wave size, seq<M, N>
- * @tparam Vector Contiguous elements (vector size) along seq<M, N>
+ * @tparam ThreadTile Contiguous elements per thread along seq<M, N>
  */
-template <typename BlockWaves, typename BlockTile, typename WaveTile, typename Vector>
+template <typename BlockWaves, typename BlockTile, typename WaveTile, typename ThreadTile>
 struct TileCopyShape
 {
-    // Vector dimensions for memory operations
-    static constexpr index_t Vector_M = Vector::at(number<0>{});
-    static constexpr index_t Vector_N = Vector::at(number<1>{});
+    // ThreadTile dimensions for memory operations
+    static constexpr index_t ThreadTile_M = ThreadTile::at(number<0>{});
+    static constexpr index_t ThreadTile_N = ThreadTile::at(number<1>{});
 
     // Wave tile dimensions
     static constexpr index_t Wave_Tile_M = WaveTile::at(number<0>{});
@@ -51,7 +51,7 @@ struct TileCopyShape
     // Configuration validation
     static_assert(Block_Tile_M > 0 && Block_Tile_N > 0, "Block tile dimensions must be positive");
     static_assert(Wave_Tile_M > 0 && Wave_Tile_N > 0, "Wave tile dimensions must be positive");
-    static_assert(Vector_M > 0 && Vector_N > 0, "Vector dimensions must be positive");
+    static_assert(ThreadTile_M > 0 && ThreadTile_N > 0, "ThreadTile dimensions must be positive");
     static_assert(Waves_Per_Block_M > 0 && Waves_Per_Block_N > 0,
                   "Waves per block must be positive");
     static_assert(Waves_Per_Block_M * Wave_Tile_M > 0,
@@ -60,8 +60,8 @@ struct TileCopyShape
                   "Invalid wave configuration for N dimension");
 
     // Ensure wave tile dimensions align with wave size
-    static_assert(Wave_Tile_M / Vector_M * Wave_Tile_N / Vector_N == WaveSize,
-                  "(Wave_Tile_M/Vector_M) * (Wave_Tile_N/Vector_N) != WaveSize");
+    static_assert(Wave_Tile_M / ThreadTile_M * Wave_Tile_N / ThreadTile_N == WaveSize,
+                  "(Wave_Tile_M/ThreadTile_M) * (Wave_Tile_N/ThreadTile_N) != WaveSize");
 };
 
 /**
@@ -95,7 +95,7 @@ struct TileCopyPolicy
         constexpr index_t block_size = S::BlockSize;
 
         // Distribution calculation to ensure all threads participate
-        constexpr index_t N1 = S::Vector_N;          // Elements per thread along N
+        constexpr index_t N1 = S::ThreadTile_N;      // Elements per thread along N
         constexpr index_t N0 = S::Block_Tile_N / N1; // Threads needed along N
 
         constexpr index_t M2 = wave_size / N0;              // Threads per wave along M
@@ -143,23 +143,21 @@ struct TileCopyKernel
 
         // Create tensor views for input and output
         const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::ThreadTile_N>{}, number<1>{});
 
         const auto y_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_y, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+            p_y, make_tuple(M, N), make_tuple(N, 1), number<S::ThreadTile_N>{}, number<1>{});
 
         // Create tile windows with DRAM distribution
-        auto x_window =
-            make_tile_window(x_m_n,
-                             make_tuple(number<S::Block_Tile_M>{}, number<S::Block_Tile_N>{}),
-                             {tile_block_origin_m, 0},
-                             Policy::template MakeDRAMDistribution<Problem>());
+        auto x_window = make_tile_window(x_m_n,
+                                         make_tuple(S::Block_Tile_M, S::Block_Tile_N),
+                                         {tile_block_origin_m, 0},
+                                         Policy::template MakeDRAMDistribution<Problem>());
 
-        auto y_window =
-            make_tile_window(y_m_n,
-                             make_tuple(number<S::Block_Tile_M>{}, number<S::Block_Tile_N>{}),
-                             {tile_block_origin_m, 0},
-                             Policy::template MakeDRAMDistribution<Problem>());
+        auto y_window = make_tile_window(y_m_n,
+                                         make_tuple(S::Block_Tile_M, S::Block_Tile_N),
+                                         {tile_block_origin_m, 0},
+                                         Policy::template MakeDRAMDistribution<Problem>());
 
         // Calculate iterations needed to cover N dimension
         // Note: This kernel uses data parallelism only in the M dimension.
@@ -218,23 +216,21 @@ struct ElementWiseTileCopyKernel
 
         // Create tensor views for input and output
         const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::ThreadTile_N>{}, number<1>{});
 
         const auto y_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_y, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+            p_y, make_tuple(M, N), make_tuple(N, 1), number<S::ThreadTile_N>{}, number<1>{});
 
         // Create tile windows with DRAM distribution
-        auto x_window =
-            make_tile_window(x_m_n,
-                             make_tuple(number<S::Block_Tile_M>{}, number<S::Block_Tile_N>{}),
-                             {tile_block_origin_m, 0},
-                             Policy::template MakeDRAMDistribution<Problem>());
+        auto x_window = make_tile_window(x_m_n,
+                                         make_tuple(S::Block_Tile_M, S::Block_Tile_N),
+                                         {tile_block_origin_m, 0},
+                                         Policy::template MakeDRAMDistribution<Problem>());
 
-        auto y_window =
-            make_tile_window(y_m_n,
-                             make_tuple(number<S::Block_Tile_M>{}, number<S::Block_Tile_N>{}),
-                             {tile_block_origin_m, 0},
-                             Policy::template MakeDRAMDistribution<Problem>());
+        auto y_window = make_tile_window(y_m_n,
+                                         make_tuple(S::Block_Tile_M, S::Block_Tile_N),
+                                         {tile_block_origin_m, 0},
+                                         Policy::template MakeDRAMDistribution<Problem>());
 
         // Calculate iterations needed to cover N dimension
         // Note: This kernel uses data parallelism only in the M dimension.
@@ -297,45 +293,41 @@ struct TileCopyKernel_LDS
         }
 
         // LDS buffer allocation
-        __shared__ XDataType x_lds_buffer[S::Block_Tile_M * S::Block_Tile_N];
+        __shared__ XDataType x_lds_buffer[S::Block_Tile_M * S::Block_Tile_N];
 
         // LDS tensor descriptor and view
         const auto x_lds_descriptor =
             make_naive_tensor_descriptor(make_tuple(S::Block_Tile_M, S::Block_Tile_N),
                                          make_tuple(S::Block_Tile_N, 1),
-                                         number<S::Vector_N>{},
+                                         number<S::ThreadTile_N>{},
                                          number<1>{});
 
         auto x_lds_view = make_tensor_view<address_space_enum::lds>(x_lds_buffer, x_lds_descriptor);
 
         // LDS windows with different distributions for optimal access patterns
-        auto x_lds_write_window = make_tile_window(
-            x_lds_view, make_tuple(number<S::Block_Tile_M>{}, number<S::Block_Tile_N>{}), {0, 0});
+        auto x_lds_write_window =
+            make_tile_window(x_lds_view, make_tuple(S::Block_Tile_M, S::Block_Tile_N), {0, 0});
 
-        auto x_lds_read_window =
-            make_tile_window(x_lds_view,
-                             make_tuple(number<S::Block_Tile_M>{}, number<S::Block_Tile_N>{}),
-                             {0, 0},
-                             Policy::template MakeDRAMDistribution<Problem>());
+        auto x_lds_read_window = make_tile_window(x_lds_view,
+                                                  make_tuple(S::Block_Tile_M, S::Block_Tile_N),
+                                                  {0, 0},
+                                                  Policy::template MakeDRAMDistribution<Problem>());
 
         // Global memory tensor views
         const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::ThreadTile_N>{}, number<1>{});
 
         const auto y_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_y, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+            p_y, make_tuple(M, N), make_tuple(N, 1), number<S::ThreadTile_N>{}, number<1>{});
 
         // Global memory tile windows
-        auto x_window =
-            make_tile_window(x_m_n,
-                             make_tuple(number<S::Block_Tile_M>{}, number<S::Block_Tile_N>{}),
-                             {tile_block_origin_m, 0},
-                             Policy::template MakeDRAMDistribution<Problem>());
+        auto x_window = make_tile_window(x_m_n,
+                                         make_tuple(S::Block_Tile_M, S::Block_Tile_N),
+                                         {tile_block_origin_m, 0},
+                                         Policy::template MakeDRAMDistribution<Problem>());
 
-        auto y_window =
-            make_tile_window(y_m_n,
-                             make_tuple(number<S::Block_Tile_M>{}, number<S::Block_Tile_N>{}),
-                             {tile_block_origin_m, 0});
+        auto y_window = make_tile_window(
+            y_m_n, make_tuple(S::Block_Tile_M, S::Block_Tile_N), {tile_block_origin_m, 0});
 
         // Calculate iterations needed to cover N dimension
         // Note: This kernel uses data parallelism only in the M dimension.
diff --git a/example/ck_tile/39_copy/test_tile_example.sh b/example/ck_tile/39_copy/test_tile_example.sh
new file mode 100755
index 0000000000..fcd8c8e991
--- /dev/null
+++ b/example/ck_tile/39_copy/test_tile_example.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+BIN="${BIN:-../../../build/bin/tile_example_copy}"
+WARMUP="${WARMUP:-20}"
+REPEAT="${REPEAT:-100}"
+VALIDATE="${VALIDATE:-1}"
+
+MS=(128 256 512 1024)
+NS=(64 256 1024 2048 4096)
+PRECS=(fp16 fp32)
+
+echo "Using BIN=$BIN"
+echo "WARMUP=$WARMUP REPEAT=$REPEAT VALIDATE=$VALIDATE"
+
+failures=0
+
+for prec in "${PRECS[@]}"; do
+  for m in "${MS[@]}"; do
+    for n in "${NS[@]}"; do
+      echo "=============================================="
+      echo "Running: prec=$prec m=$m n=$n"
+      set +e
+      out="$("$BIN" -prec="$prec" -m="$m" -n="$n" -warmup="$WARMUP" -repeat="$REPEAT" -v="$VALIDATE" 2>&1)"
+      rc=$?
+      set -e
+
+      echo "$out"
+      if [[ $rc -ne 0 ]]; then
+        echo "RUN ERROR (rc=$rc) for m=$m n=$n prec=$prec"
+        ((failures++)) || true
+        continue
+      fi
+
+      if [[ "$VALIDATE" == "1" ]]; then
+        if ! grep -q "valid:y" <<<"$out"; then
+          echo "VALIDATION FAILED for m=$m n=$n prec=$prec"
+          ((failures++)) || true
+        fi
+      fi
+    done
+  done
+done
+
+echo "=============================================="
+if [[ $failures -ne 0 ]]; then
+  echo "$failures runs failed"
+  exit 1 # non-zero status so CI and calling scripts see the failure
+fi
+echo "All runs passed"
\ No newline at end of file
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 794c6f4e20..09801203ba 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -222,9 +222,6 @@
 // TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
 #define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
 
-// workaround: conv crash when K, C is even
-#define CK_WORKAROUND_DISABLE_FILTER1x1STRIDE1PAD0_WHEN_K_C_IS_EVEN 1
-
 // workaround: compiler crash when compiling recursive lambda
 #define CK_WORKAROUND_SWDEV_275126 1
 
diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp
index 5439bbe1f0..2bc5a4414e 100644
--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -52,10 +52,27 @@ inline std::string get_device_name()
     }
 }
 
+inline bool is_gfx12_supported()
+{
+    return ck::get_device_name() == "gfx1200" || ck::get_device_name() == "gfx1201";
+}
+
+inline bool is_gfx11_supported()
+{
+    return ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
+           ck::get_device_name() == "gfx1102" || ck::get_device_name() == "gfx1103" ||
+           ck::get_device_name() == "gfx1150" || ck::get_device_name() == "gfx1151" ||
+           ck::get_device_name() == "gfx1152";
+}
+
 inline bool is_xdl_supported()
 {
     return ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
-           ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950";
+           ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"
+#if defined(CK_ENABLE_DYNAMIC_WARP_SIZE)
+           || is_gfx12_supported() || is_gfx11_supported()
+#endif
+        ;
 }
 
 inline bool is_lds_direct_load_supported()
@@ -67,7 +84,8 @@ inline bool is_lds_direct_load_supported()
 
 inline bool is_bf16_atomic_supported()
 {
-    return ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950";
+    return ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+           is_gfx12_supported();
 }
 
 inline bool is_gfx101_supported()
@@ -83,18 +101,5 @@ inline bool is_gfx103_supported()
            ck::get_device_name() == "gfx1035" || ck::get_device_name() == "gfx1036";
 }
 
-inline bool is_gfx11_supported()
-{
-    return ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
-           ck::get_device_name() == "gfx1102" || ck::get_device_name() == "gfx1103" ||
-           ck::get_device_name() == "gfx1150" || ck::get_device_name() == "gfx1151" ||
-           ck::get_device_name() == "gfx1152";
-}
-
-inline bool is_gfx12_supported()
-{
-    return ck::get_device_name() == "gfx1200" || ck::get_device_name() == "gfx1201";
-}
-
 } // namespace ck
 #endif
diff --git a/include/ck/library/utility/validation_common.hpp b/include/ck/library/utility/validation_common.hpp
new file mode 100644
index 0000000000..38933c6d7c
--- /dev/null
+++ b/include/ck/library/utility/validation_common.hpp
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include "ck/ck.hpp"
+#include "ck/utility/type.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+namespace ck {
+namespace utils {
+
+template <typename Layout>
+inline void
+validate_gemm_stride(int M, int N, int stride, const std::string& stride_name = "Stride")
+{
+    if(ck::is_same_v<Layout, ck::tensor_layout::gemm::ColumnMajor>)
+    {
+        if(stride < M)
+        {
+            throw std::runtime_error(
+                "Error: For ColumnMajor layout, " + stride_name + " (" + std::to_string(stride) +
+                ") must be greater than or equal to dim (" + std::to_string(M) + ")");
+        }
+    }
+    else // RowMajor
+    {
+        if(stride < N)
+        {
+            throw std::runtime_error(
+                "Error: For RowMajor layout, " + stride_name + " (" + std::to_string(stride) +
+                ") must be greater than or equal to dim (" + std::to_string(N) + ")");
+        }
+    }
+}
+
+// Convenience functions for common GEMM patterns
+template <typename ALayout, typename BLayout, typename CLayout>
+inline void validate_gemm_strides_abc(int M, int N, int K, int StrideA, int StrideB, int StrideC)
+{
+    validate_gemm_stride<ALayout>(M, K, StrideA, "StrideA");
+    validate_gemm_stride<BLayout>(K, N, StrideB, "StrideB");
+    validate_gemm_stride<CLayout>(M, N, StrideC, "StrideC");
+}
+
+} // namespace utils
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
index cd13dbb836..acd1d2ae49 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
@@ -41,7 +41,9 @@ struct BlockwiseGemmXdlops_pipeline_base
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
     // Hardcode to 64, as HIP-provided "WarpSize" would return 32 on RDNA GPUs.
-    static constexpr index_t WaveSize = 64;
+    static constexpr index_t MWaves   = MPerBlock / (MRepeat * MPerXDL);
+    static constexpr index_t NWaves   = NPerBlock / (NRepeat * NPerXDL);
+    static constexpr index_t WaveSize = BlockSize / MWaves / NWaves;
 
     static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0);
     static constexpr index_t B_K0 = BTileDesc{}.GetLength(I0);
@@ -74,9 +76,6 @@ struct BlockwiseGemmXdlops_pipeline_base
             return 1;
     }();
 
-    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
-    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
-
     using HotLoopInstList =
         ck::BlockwiseGemmXdlops_pipeline_hotloop_inst<BlockSize,
                                                       MPerBlock,
@@ -219,6 +218,7 @@ struct BlockwiseGemmXdlops_pipeline_base
                                       Tuple4 b_origin = CalculateBThreadOriginDataIndex())
         : a_thread_copy_(a_origin), b_thread_copy_(b_origin)
     {
+#if defined(__HIP_DEVICE_COMPILE__)
         static_assert(AMmaTileDesc::IsKnownAtCompileTime() && BMmaTileDesc::IsKnownAtCompileTime(),
                       "wrong! Desc should be known at compile-time");
 
@@ -227,6 +227,7 @@ struct BlockwiseGemmXdlops_pipeline_base
 
         static_assert(MPerBlock % (MPerXDL * MRepeat) == 0 && NPerBlock % (NPerXDL * NRepeat) == 0,
                       "wrong!");
+#endif
     }
 
     // transposed XDL output supporting C_xdl' = B_xdl' * A_xdl'
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
index a6b5e272ff..4cc1cf569d 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -139,9 +139,10 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
 
     using Base::AMmaKStride;
     using Base::BMmaKStride;
+    using Base::WaveSize;
 
     static constexpr index_t WgpPerCU =
-        (4 * WarpSize / BlockSize) >= 1 ? 4 * WarpSize / BlockSize : 1;
+        (4 * WaveSize / BlockSize) >= 1 ? 4 * WaveSize / BlockSize : 1;
     static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
         32768 / WgpPerCU,
         (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
@@ -625,13 +626,14 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
 
     using Base::a_block_desc_m0_m1_m2_k;
     using Base::b_block_desc_n0_n1_n2_k;
+    using Base::WaveSize;
 
     static constexpr index_t NumMacClusters = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS;
     static constexpr index_t KPerInnerLoop  = math::max(KPerThread / NumMacClusters, KPack);
     static constexpr index_t KRepeat        = KPerThread / KPerInnerLoop;
 
     static constexpr index_t WgpPerCU =
-        (4 * WarpSize / BlockSize) >= 1 ? 4 * WarpSize / BlockSize : 1;
+        (4 * WaveSize / BlockSize) >= 1 ? 4 * WaveSize / BlockSize : 1;
     static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
         32768 / WgpPerCU,
         (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp
index 0c030030fe..119f8a3306 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -141,9 +141,10 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale<BlockGemmPipelineScheduler::Intr
 
     using Base::AMmaKStride;
     using Base::BMmaKStride;
+    using Base::WaveSize;
 
     static constexpr index_t WgpPerCU =
-        (4 * WarpSize / BlockSize) >= 1 ? 4 * WarpSize / BlockSize : 1;
+        (4 * WaveSize / BlockSize) >= 1 ? 4 * WaveSize / BlockSize : 1;
     static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
         32768 / WgpPerCU,
         (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp
index 69002d7962..80c65515e8 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -139,9 +139,10 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Intra
 
     using Base::AMmaKStride;
     using Base::BMmaKStride;
+    using Base::WaveSize;
 
     static constexpr index_t WgpPerCU =
-        (4 * WarpSize / BlockSize) >= 1 ? 4 * WarpSize / BlockSize : 1;
+        (4 * WaveSize / BlockSize) >= 1 ? 4 * WaveSize / BlockSize : 1;
     static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
         32768 / WgpPerCU,
         (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
@@ -626,13 +627,14 @@ struct BlockwiseGemmXdlops_pipeline_v2_b_scale<BlockGemmPipelineScheduler::Inter
 
     using Base::a_block_desc_m0_m1_m2_k;
     using Base::b_block_desc_n0_n1_n2_k;
+    using Base::WaveSize;
 
     static constexpr index_t NumMacClusters = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS;
     static constexpr index_t KPerInnerLoop  = math::max(KPerThread / NumMacClusters, KPack);
     static constexpr index_t KRepeat        = KPerThread / KPerInnerLoop;
 
     static constexpr index_t WgpPerCU =
-        (4 * WarpSize / BlockSize) >= 1 ? 4 * WarpSize / BlockSize : 1;
+        (4 * WaveSize / BlockSize) >= 1 ? 4 * WaveSize / BlockSize : 1;
     static constexpr index_t FullMemBandPrefetchStages = math::integer_divide_ceil(
         32768 / WgpPerCU,
         (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp
index b5d6180ab3..7203348418 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -159,6 +159,7 @@ struct BlockwiseGemmXdlops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
 
     __device__ static constexpr auto HotLoopScheduler()
     {
+#if !defined(__gfx11__) && !defined(__gfx12__)
         // A/B split schedule
         // compiler is likely to use ds_read2 when instruction width smaller than 16bytes
         constexpr auto num_ds_read_inst_a =
@@ -260,6 +261,7 @@ struct BlockwiseGemmXdlops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
             }
             __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
         });
+#endif
     }
 
     template <bool HasMainLoop,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp
index d5fec7201a..f444399812 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp
@@ -231,11 +231,22 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
                 }
             };
 
-            constexpr index_t minimum_occupancy =
-                (BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave &&
-                 MPerBlock * NPerBlock / BlockSize > 64)
-                    ? 1
-                    : 2;
+            constexpr index_t minimum_occupancy = [&]() {
+                if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout> &&
+                             is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
+                {
+                    // FIXME: many instances have many spills with occupancy > 1, a better solution
+                    // needed to get best performance
+                    return 1;
+                }
+                else
+                {
+                    return (BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave &&
+                            MPerBlock * NPerBlock / BlockSize > 64)
+                               ? 1
+                               : 2;
+                }
+            }();
 
             if(has_main_k_block_loop)
             {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
index dde21725d0..1cb82d24eb 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
@@ -176,8 +176,36 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
                                                        BElementwiseOperation,
                                                        CElementwiseOperation>
 {
+    template <bool isWave64>
+    static constexpr auto GetNXdlPerWave()
+    {
+        constexpr index_t Waves  = isWave64 ? BlockSize / 64 : BlockSize / 32;
+        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXDL);
+        static_assert(MWaves > 0);
+
+        constexpr index_t NWaves = Waves / MWaves;
+        if constexpr(NWaves == 0)
+        {
+            return 0;
+        }
+        else
+        {
+            if constexpr(NPerBlock % (NPerXDL * NWaves) == 0)
+            {
+                return NPerBlock / (NWaves * NPerXDL);
+            }
+            else
+            {
+                return 0;
+            }
+        }
+    }
     // GridwiseGemm
-    using GridwiseGemm = GridwiseGemm_xdl_cshuffle_v3<
+    static constexpr auto NXdlPerWave64 = GetNXdlPerWave<true>();
+    static constexpr auto NXdlPerWave32 = GetNXdlPerWave<false>();
+
+    template <index_t NXdlPerWave_>
+    using GridwiseGemmBase = GridwiseGemm_xdl_cshuffle_v3<
         ALayout,
         BLayout,
         CLayout,
@@ -199,7 +227,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave,
+        NXdlPerWave_,
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
@@ -226,8 +254,10 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
         ComputeTypeB,
         PermuteA,
         PermuteB>;
+    using GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)>;
+    using GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32>;
 
-    using Argument = typename GridwiseGemm::Argument;
+    using Argument = typename GridwiseGemm64::Argument;
 
     static constexpr index_t APackedSize = []() {
         if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
@@ -254,12 +284,9 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
     ///
     struct Invoker : public BaseInvoker
     {
-        /// @brief  This function issues GPU kernel execution.
-        /// @param arg           The GPU kernel arguments.
-        /// @param stream_config The HIP stream configuration helper structure.
-        /// @return              The kernel's average execution time (if time measurement is
-        ///                      enabled).
-        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        template <typename GridwiseGemm>
+        float RunImp(const typename GridwiseGemm::Argument& arg,
+                     const StreamConfig& stream_config = StreamConfig{})
         {
             if(stream_config.log_level_ > 0)
             {
@@ -285,7 +312,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
             const auto Run = [&](const auto& kernel) {
                 if(stream_config.flush_cache)
                 {
-                    Argument arg_ = arg;
+                    auto arg_ = arg;
 
                     const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
                         arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
@@ -297,7 +324,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
                     auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() *
                                          sizeof(BDataType) / BPackedSize;
 
-                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                    ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
                         arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
                     rotating_mem.Print();
 
@@ -733,6 +760,31 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
             return ave_time;
         }
 
+        /// @brief  This function issues GPU kernel execution.
+        /// @param arg           The GPU kernel arguments.
+        /// @param stream_config The HIP stream configuration helper structure.
+        /// @return              The kernel's average execution time (if time measurement is
+        ///                      enabled).
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(get_warp_size() == 64)
+            {
+                if constexpr(NXdlPerWave64 > 0)
+                {
+                    return RunImp<GridwiseGemm64>(arg, stream_config);
+                }
+            }
+            else
+            {
+                if constexpr(NXdlPerWave32 > 0)
+                {
+                    return RunImp<GridwiseGemm32>(
+                        reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg),
+                        stream_config);
+                }
+            }
+            return 0;
+        }
         // polymorphic
         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
@@ -754,9 +806,39 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
             return false;
         }
 
-        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
+        if(arg.KBatch > 1)
         {
-            return false;
+            if(is_gfx11_supported())
+            {
+                return false;
+            }
+
+            if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t>)
+            {
+                return false;
+            }
+
+            if(sizeof(CDataType) == 1)
+            {
+                return false;
+            }
+        }
+
+        if(is_gfx11_supported() || is_gfx12_supported())
+        {
+            if(MPerXDL != 16 || NPerXDL != 16)
+            {
+                return false;
+            }
+        }
+
+        if(is_gfx11_supported())
+        {
+            if constexpr(std::is_same_v<ADataType, ck::f8_t> ||
+                         std::is_same_v<ADataType, ck::bf8_t>)
+            {
+                return false;
+            }
         }
 
         if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
@@ -767,7 +849,29 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
             return false;
         }
 
-        return GridwiseGemm::CheckValidity(arg);
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                return GridwiseGemm64::CheckValidity(arg);
+            }
+            else
+            {
+                return false;
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                return GridwiseGemm32::CheckValidity(
+                    reinterpret_cast<const typename GridwiseGemm32::Argument&>(arg));
+            }
+            else
+            {
+                return false;
+            }
+        }
     }
 
     // polymorphic
@@ -849,6 +953,25 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
             {BlockGemmPipelineVersion::v4, "v4"},
             {BlockGemmPipelineVersion::v5, "v5"}};
 
+        index_t PrefetchStages = 0;
+        index_t AMmaKStride    = 0;
+        if(get_warp_size() == 64)
+        {
+            if constexpr(NXdlPerWave64 > 0)
+            {
+                PrefetchStages = GridwiseGemm64::BlockwiseGemmPipe::PrefetchStages;
+                AMmaKStride    = GridwiseGemm64::BlockwiseGemmPipe::AMmaKStride;
+            }
+        }
+        else
+        {
+            if constexpr(NXdlPerWave32 > 0)
+            {
+                PrefetchStages = GridwiseGemm32::BlockwiseGemmPipe::PrefetchStages;
+                AMmaKStride    = GridwiseGemm32::BlockwiseGemmPipe::AMmaKStride;
+            }
+        }
+
         // clang-format off
         str << "DeviceGemmXdlUniversal"
             << "<"
@@ -872,9 +995,9 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages << ", "
+            << PrefetchStages << ", "
             << "Kpack: "
-            << GridwiseGemm::BlockwiseGemmPipe::AMmaKStride;
+            << AMmaKStride;
         // clang-format on
 
         return str.str();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
index 1cd1f16245..6e74899706 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
@@ -1299,13 +1299,6 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
         if constexpr(ConvBackwardWeightSpecialization ==
                      ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
         {
-// workaround: disable when K, C is even
-#if CK_WORKAROUND_DISABLE_FILTER1x1STRIDE1PAD0_WHEN_K_C_IS_EVEN
-            if(arg.Conv_C_ % 2 == 0 || arg.Conv_K_ % 2 == 0)
-            {
-                return false;
-            }
-#endif
             // check if it's 1x1, stride=1 pad = 0 conv
             for(int i = 0; i < NDimSpatial; i++)
             {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
index 8fea287941..4c07d60b0f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
@@ -35,17 +35,20 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#if defined(__gfx9__) || defined(__gfx12__) || defined(__gfx11__)
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
-        p_shared,
-        karg);
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+            p_shared,
+            karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -63,21 +66,24 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     // __attribute__((amdgpu_waves_per_eu(1, 1)))
     kernel_gemm_xdl_cshuffle_v3_2lds(typename GridwiseGemm::Argument karg)
 {
-#if defined(__gfx9__)
+#if defined(__gfx9__) || defined(__gfx12__) || defined(__gfx11__)
     // Pass two lds pointer is the key to tell compiler that ds_read/write
     // operate on different lds chunk at same time without order dependecy
-    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
+    {
+        __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+        __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
 
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
-        p_shared_0,
-        p_shared_1,
-        karg);
+        GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+            karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
+            p_shared_0,
+            p_shared_1,
+            karg);
+    }
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -666,12 +672,23 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
         __host__ void Print() const
         {
-            std::cout << "problem {" << "M:" << M << ", " << "N:" << N << ", " << "K:" << K << ", "
-                      << "SA:" << StrideA << ", " << "SB:" << StrideB << ", " << "SC:" << StrideC
-                      << ", " << "MP:" << MPadded << ", " << "NP:" << NPadded << ", "
-                      << "KRead:" << KRead << ", " << "KP:" << KPadded << ", " << "AK0:" << AK0
-                      << ", " << "BK0:" << BK0 << ", " << "MBlock: " << MBlock << ", "
+            // clang-format off
+            std::cout << "problem {" 
+                      << "M:" << M << ", " 
+                      << "N:" << N << ", " 
+                      << "K:" << K << ", "
+                      << "SA:" << StrideA << ", " 
+                      << "SB:" << StrideB << ", " 
+                      << "SC:" << StrideC << ", " 
+                      << "MP:" << MPadded << ", " 
+                      << "NP:" << NPadded << ", "
+                      << "KRead:" << KRead << ", " 
+                      << "KP:" << KPadded << ", " 
+                      << "AK0:" << AK0 << ", " 
+                      << "BK0:" << BK0 << ", " 
+                      << "MBlock: " << MBlock << ", "
                       << "NBlock: " << NBlock << "}" << std::endl;
+            // clang-format on
         }
 
         index_t M;
@@ -801,6 +818,10 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
+        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWaves = (NXdlPerWave * NPerXdl == 0) ? 0 : NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = (MWaves * NWaves == 0) ? 64 : BlockSize / (MWaves * NWaves);
+
         // A matrix in LDS memory, dst of blockwise copy
         if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -858,7 +879,7 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
             constexpr auto KThreadWrite     = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
             constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / MPerXdl;
+            constexpr auto KThreadRead      = WaveSize / MPerXdl;
             constexpr auto K0PerThreadRead  = AK0Number / KThreadRead;
 
             constexpr auto kfold = (AK1Number * M0 * sizeof(ADataType) > 128)
@@ -939,6 +960,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
     __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
+        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWaves = (NXdlPerWave * NPerXdl == 0) ? 0 : NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr index_t WaveSize = (MWaves * NWaves == 0) ? 64 : BlockSize / (MWaves * NWaves);
         // B matrix in LDS memory, dst of blockwise copy
         if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
         {
@@ -992,7 +1016,7 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
             constexpr auto KThreadWrite     = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0);
             constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite;
-            constexpr auto KThreadRead      = 64 / NPerXdl;
+            constexpr auto KThreadRead      = WaveSize / NPerXdl;
             constexpr auto K0PerThreadRead  = BK0Number / KThreadRead;
 
             constexpr auto kfold = (BK1Number * N0 * sizeof(BDataType) > 128)
@@ -1139,12 +1163,99 @@ struct GridwiseGemm_xdl_cshuffle_v3
                          c_block_size * sizeof(CShuffleDataType));
     }
 
+    template <InMemoryDataOperationEnum CGlobalMemoryDataOperation>
+    __device__ static bool constexpr IsValidCompilationParameter()
+    {
+        enum struct Arch : bool
+        {
+#if defined(__gfx950__)
+            is_gfx950_build = true,
+#else
+            is_gfx950_build = false,
+#endif
+        };
+        
+        // skip building the instances with K1>=32 && PackedSize != 2 on pre-gfx950
+        if constexpr(static_cast<bool>(Arch::is_gfx950_build) ||
+                    (AK1Number < 32 && BK1Number < 32) ||
+                    (AK1Number >= 32 && APackedSize == 2) ||
+                    (BK1Number >= 32 && BPackedSize == 2))
+        {
+        
+        }
+        else
+        {
+            return false;
+        }
+
+        // Check tile size
+#if defined(__gfx11__) || defined(__gfx12__)
+        if constexpr(MPerXdl != 16 || NPerXdl != 16)
+        {
+            return false;
+        }
+#endif
+        // Check atomic caps
+#if defined(__gfx11__)
+        constexpr bool SupportMemOp = CGlobalMemoryDataOperation == InMemoryDataOperationEnum::Set;
+#else
+        constexpr bool SupportMemOp = sizeof(CDataType) >= 2 || (CGlobalMemoryDataOperation ==
+                                                                 InMemoryDataOperationEnum::Set);
+#endif
+        if constexpr(SupportMemOp == false)
+        {
+            return false;
+        }
+
+        // Check that the wave size implied by the tile layout matches get_warp_size()
+        if constexpr(MXdlPerWave > 0 && NXdlPerWave > 0)
+        {
+            constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
+            constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl);
+            if constexpr(MWaves > 0 && NWaves > 0)
+            {
+                constexpr index_t WaveSize = BlockSize / (MWaves * NWaves);
+                if constexpr(WaveSize == get_warp_size())
+                {
+                    return true;
+                }
+                else
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+    }
     // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
     __host__ static constexpr bool CheckValidity(const Argument& karg)
     {
-        static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
-                          (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
-                      "Invalid tuning param!");
+        if constexpr((MPerXdl * MXdlPerWave) == 0 || (NXdlPerWave * NPerXdl) == 0)
+        {
+            return false;
+        }
+        else
+        {
+            if constexpr((MPerBlock % (MPerXdl * MXdlPerWave) != 0) ||
+                         (NPerBlock % (NXdlPerWave * NPerXdl) != 0))
+            {
+                return false;
+            }
+            else
+            {
+                if(BlockwiseGemmPipe::WaveSize != get_warp_size())
+                {
+                    return false;
+                }
+            }
+        }
 
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
index 93ec6ca31e..e80a3702fb 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
@@ -402,6 +402,34 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
         }
     }
 
+    __host__ __device__ static constexpr auto MakeAScaleGridDesciptor_M_K(index_t M, index_t K)
+    {
+        const auto BM = math::integer_divide_ceil(M, ScaleBlockM);
+        const auto BK = math::integer_divide_ceil(K, ScaleBlockK);
+        if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
+        {
+            return make_naive_tensor_descriptor(make_tuple(BM, BK), make_tuple(BK, I1));
+        }
+        else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
+        {
+            return make_naive_tensor_descriptor(make_tuple(BM, BK), make_tuple(I1, BM));
+        }
+    }
+
+    __host__ __device__ static constexpr auto MakeBScaleGridDesciptor_N_K(index_t N, index_t K)
+    {
+        const auto BN = math::integer_divide_ceil(N, ScaleBlockN);
+        const auto BK = math::integer_divide_ceil(K, ScaleBlockK);
+        if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+        {
+            return make_naive_tensor_descriptor(make_tuple(BN, BK), make_tuple(BK, I1));
+        }
+        else if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+        {
+            return make_naive_tensor_descriptor(make_tuple(BN, BK), make_tuple(I1, BN));
+        }
+    }
+
     template <typename ABlockDesc_AK0_M_AK1>
     __host__ __device__ static constexpr auto
     MakeAMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&)
@@ -1181,14 +1209,8 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
         const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
             problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
 
-        const auto a_scale_grid_desc_am_ak = make_naive_tensor_descriptor(
-            make_tuple(math::integer_divide_ceil(problem.M, ScaleBlockM),
-                       math::integer_divide_ceil(problem.K, ScaleBlockK)),
-            make_tuple(math::integer_divide_ceil(problem.K, ScaleBlockK), 1));
-        const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor(
-            make_tuple(math::integer_divide_ceil(problem.N, ScaleBlockN),
-                       math::integer_divide_ceil(problem.K, ScaleBlockK)),
-            make_tuple(math::integer_divide_ceil(problem.K, ScaleBlockK), 1));
+        const auto a_scale_grid_desc_am_ak = MakeAScaleGridDesciptor_M_K(problem.M, problem.K);
+        const auto b_scale_grid_desc_bn_ak = MakeBScaleGridDesciptor_N_K(problem.N, problem.K);
 
         const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
             MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
index 64d7f92750..2ce08e7044 100644
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -6,6 +6,7 @@
 #include "ck/utility/common_header.hpp"
 #include "ck/utility/math.hpp"
 #include "ck/utility/amd_xdlops.hpp"
+#include "ck/utility/amd_wmma.hpp"
 
 namespace ck {
 /**
@@ -76,7 +77,21 @@ enum struct MfmaInstr
     mfma_f32_32x32x64f8f6f4,
     mfma_f32_16x16x128f8f6f4,
     mfma_scale_f32_32x32x64f8f6f4,
-    mfma_scale_f32_16x16x128f8f6f4
+    mfma_scale_f32_16x16x128f8f6f4,
+    // gfx11
+    wmma_f32_16x16x16_f16,
+    wmma_f32_16x16x16_bf16,
+    wmma_i32_16x16x16_iu8,
+    wmma_unsupport_16x16_gfx11,
+    // gfx12
+    wmma_f32_16x16x16_f16_gfx12,
+    wmma_f32_16x16x16_bf16_gfx12,
+    wmma_i32_16x16x16_iu8_gfx12,
+    wmma_f32_16x16x16_f8f8_gfx12,
+    wmma_f32_16x16x16_f8bf8_gfx12,
+    wmma_f32_16x16x16_bf8f8_gfx12,
+    wmma_f32_16x16x16_bf8bf8_gfx12,
+    wmma_unsupport_16x16_gfx12,
 };
 
 template <MfmaInstr instr>
@@ -932,6 +947,175 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>
     }
 };
 
+// gfx11
+struct mfma_type_gfx11_base
+{
+    static constexpr index_t group_size          = 8;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 8;
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 32;
+    static constexpr index_t num_input_blks      = 1;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 16;
+    static constexpr bool is_k_reduction         = true;
+};
+
+template <>
+struct mfma_type<MfmaInstr::wmma_f32_16x16x16_f16> : public mfma_type_gfx11_base
+{
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_wmma_f32_16x16x16_f16_w32<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::wmma_f32_16x16x16_bf16> : public mfma_type_gfx11_base
+{
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_wmma_f32_16x16x16_bf16_w32<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::wmma_i32_16x16x16_iu8> : public mfma_type_gfx11_base
+{
+    template <index_t MPerWmma,
+              index_t NPerWmma,
+              class FloatA,
+              class FloatB,
+              class FloatC,
+              bool neg_a = true,
+              bool neg_b = true,
+              bool clamp = false>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_wmma_i32_16x16x16_iu8_w32<MPerWmma, NPerWmma, neg_a, neg_b, clamp>::Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::wmma_unsupport_16x16_gfx11> : public mfma_type_gfx11_base
+{
+    static constexpr index_t k_per_blk = 2;
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA&, const FloatB&, FloatC&) const
+    {
+        // empty for all unsupported types.
+    }
+};
+
+// gfx12
+struct mfma_type_gfx12_base
+{
+    static constexpr index_t group_size          = 8;
+    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t num_regs_per_blk    = 8;
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 32;
+    static constexpr index_t num_input_blks      = 2;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 8;
+    static constexpr bool is_k_reduction         = true;
+};
+
+template <>
+struct mfma_type<MfmaInstr::wmma_f32_16x16x16_f16_gfx12> : public mfma_type_gfx12_base
+{
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_wmma_f32_16x16x16_f16_w32_gfx12<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::wmma_f32_16x16x16_bf16_gfx12> : public mfma_type_gfx12_base
+{
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_wmma_f32_16x16x16_bf16_w32_gfx12<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::wmma_i32_16x16x16_iu8_gfx12> : public mfma_type_gfx12_base
+{
+    template <index_t MPerWmma,
+              index_t NPerWmma,
+              class FloatA,
+              class FloatB,
+              class FloatC,
+              bool neg_a = true,
+              bool neg_b = true,
+              bool clamp = false>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_wmma_i32_16x16x16_iu8_w32_gfx12<MPerWmma, NPerWmma, neg_a, neg_b, clamp>::Run(
+            a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::wmma_f32_16x16x16_f8f8_gfx12> : public mfma_type_gfx12_base
+{
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_wmma_f32_16x16x16_f8f8_w32_gfx12<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::wmma_f32_16x16x16_f8bf8_gfx12> : public mfma_type_gfx12_base
+{
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_wmma_f32_16x16x16_f8bf8_w32_gfx12<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::wmma_f32_16x16x16_bf8f8_gfx12> : public mfma_type_gfx12_base
+{
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_wmma_f32_16x16x16_bf8f8_w32_gfx12<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::wmma_f32_16x16x16_bf8bf8_gfx12> : public mfma_type_gfx12_base
+{
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_wmma_f32_16x16x16_bf8bf8_w32_gfx12<MPerWmma, NPerWmma>::Run(a, b, reg_c);
+    }
+};
+
+template <>
+struct mfma_type<MfmaInstr::wmma_unsupport_16x16_gfx12> : public mfma_type_gfx12_base
+{
+    static constexpr index_t k_per_blk = 2;
+    template <index_t MPerWmma, index_t NPerWmma, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA&, const FloatB&, FloatC&) const
+    {
+        // empty for all unsupported types.
+    }
+};
+
 template <typename base_type,
           index_t MPerXdlops,
           index_t NPerXdlops,
@@ -951,7 +1135,13 @@ struct MfmaSelector
     template <>
     constexpr auto GetMfma<double, 16, 16>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#else
         return MfmaInstr::mfma_f64_16x16x4f64;
+#endif
     }
 
     template <>
@@ -993,7 +1183,13 @@ struct MfmaSelector
     template <>
     constexpr auto GetMfma<float, 16, 16>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#else
         return MfmaInstr::mfma_f32_16x16x4xf32;
+#endif
     }
 
     template <>
@@ -1026,7 +1222,11 @@ struct MfmaSelector
     template <>
     constexpr auto GetMfma<half_t, 16, 16, half_t, false>()
     {
-#if defined(__gfx950__)
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_f32_16x16x16_f16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_f32_16x16x16_f16;
+#elif defined(__gfx950__)
         return MfmaInstr::mfma_f32_16x16x32f16;
 #else
         return MfmaInstr::mfma_f32_16x16x16f16;
@@ -1036,7 +1236,13 @@ struct MfmaSelector
     template <>
     constexpr auto GetMfma<half_t, 16, 16, half_t, true>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_f32_16x16x16_f16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_f32_16x16x16_f16;
+#else
         return MfmaInstr::mfma_f32_16x16x16f16;
+#endif
     }
 
     template <>
@@ -1082,7 +1288,11 @@ struct MfmaSelector
     template <>
     constexpr auto GetMfma<bhalf_t, 16, 16, bhalf_t, false>()
     {
-#if defined(__gfx950__)
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_f32_16x16x16_bf16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_f32_16x16x16_bf16;
+#elif defined(__gfx950__)
         return MfmaInstr::mfma_f32_16x16x32bf16;
 #elif defined(CK_USE_AMD_MFMA_BF16_1K_OP)
         return MfmaInstr::mfma_f32_16x16x16bf16_1k;
@@ -1094,7 +1304,11 @@ struct MfmaSelector
     template <>
     constexpr auto GetMfma<bhalf_t, 16, 16, bhalf_t, true>()
     {
-#if defined(CK_USE_AMD_MFMA_BF16_1K_OP)
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_f32_16x16x16_bf16_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_f32_16x16x16_bf16;
+#elif defined(CK_USE_AMD_MFMA_BF16_1K_OP)
         return MfmaInstr::mfma_f32_16x16x16bf16_1k;
 #else
         return MfmaInstr::mfma_f32_16x16x8bf16;
@@ -1126,7 +1340,11 @@ struct MfmaSelector
     template <>
     constexpr auto GetMfma<int8_t, 16, 16, int8_t, false>()
     {
-#if defined(__gfx950__)
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_i32_16x16x16_iu8_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_i32_16x16x16_iu8;
+#elif defined(__gfx950__)
         return MfmaInstr::mfma_i32_16x16x64i8;
 #elif defined(__gfx942__)
         return MfmaInstr::mfma_i32_16x16x32i8;
@@ -1138,7 +1356,11 @@ struct MfmaSelector
     template <>
     constexpr auto GetMfma<int8_t, 16, 16, int8_t, true>()
     {
-#if defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_i32_16x16x16_iu8_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_i32_16x16x16_iu8;
+#elif defined(__gfx942__) || defined(__gfx950__)
         return MfmaInstr::mfma_i32_16x16x32i8;
 #else
         return MfmaInstr::mfma_i32_16x16x16i8;
@@ -1186,13 +1408,23 @@ struct MfmaSelector
     template <>
     constexpr auto GetMfma<f8_t, 16, 16, f8_t, true, false>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_f32_16x16x16_f8f8_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#else
         return MfmaInstr::mfma_f32_16x16x32f8f8;
+#endif
     }
 
     template <>
     constexpr auto GetMfma<f8_t, 16, 16, f8_t, false, false>()
     {
-#if defined(__gfx950__)
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_f32_16x16x16_f8f8_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#elif defined(__gfx950__)
         return MfmaInstr::mfma_f32_16x16x128f8f6f4;
 #else
         return MfmaInstr::mfma_f32_16x16x32f8f8;
@@ -1263,13 +1495,23 @@ struct MfmaSelector
     template <>
     constexpr auto GetMfma<bf8_t, 16, 16, bf8_t, true, false>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_f32_16x16x16_bf8bf8_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#else
         return MfmaInstr::mfma_f32_16x16x32bf8bf8;
+#endif
     }
 
     template <>
     constexpr auto GetMfma<bf8_t, 16, 16, bf8_t, false, false>()
     {
-#if defined(__gfx950__)
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_f32_16x16x16_bf8bf8_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#elif defined(__gfx950__)
         return MfmaInstr::mfma_f32_16x16x128f8f6f4;
 #else
         return MfmaInstr::mfma_f32_16x16x32bf8bf8;
@@ -1295,13 +1537,23 @@ struct MfmaSelector
     template <>
     constexpr auto GetMfma<f8_t, 16, 16, bf8_t, true, false>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_f32_16x16x16_f8bf8_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#else
         return MfmaInstr::mfma_f32_16x16x32f8bf8;
+#endif
     }
 
     template <>
     constexpr auto GetMfma<f8_t, 16, 16, bf8_t, false, false>()
     {
-#if defined(__gfx950__)
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_f32_16x16x16_f8bf8_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#elif defined(__gfx950__)
         return MfmaInstr::mfma_f32_16x16x128f8f6f4;
 #else
         return MfmaInstr::mfma_f32_16x16x32f8bf8;
@@ -1327,13 +1579,23 @@ struct MfmaSelector
     template <>
     constexpr auto GetMfma<bf8_t, 16, 16, f8_t, true, false>()
     {
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_f32_16x16x16_bf8f8_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#else
         return MfmaInstr::mfma_f32_16x16x32bf8f8;
+#endif
     }
 
     template <>
     constexpr auto GetMfma<bf8_t, 16, 16, f8_t, false, false>()
     {
-#if defined(__gfx950__)
+#if defined(__gfx12__)
+        return MfmaInstr::wmma_f32_16x16x16_bf8f8_gfx12;
+#elif defined(__gfx11__)
+        return MfmaInstr::wmma_unsupport_16x16_gfx11;
+#elif defined(__gfx950__)
         return MfmaInstr::mfma_f32_16x16x128f8f6f4;
 #else
         return MfmaInstr::mfma_f32_16x16x32bf8f8;
@@ -1355,10 +1617,18 @@ struct MfmaSelector
 
         static_assert(selected_mfma.num_threads_per_blk == selected_mfma.n_per_blk,
                       "n_per_blk != num_threads_per_blk");
-
+#if defined(__gfx11__)
+        if constexpr(MPerXdlops == 16 && NPerXdlops == 16)
+        {
+            static_assert(selected_mfma.num_regs_per_blk * selected_mfma.num_input_blks * 2 ==
+                              selected_mfma.m_per_blk,
+                          "m_per_blk != num_input_blks * num_regs_per_blk");
+        }
+#else
         static_assert(selected_mfma.num_regs_per_blk * selected_mfma.num_input_blks ==
                           selected_mfma.m_per_blk,
                       "m_per_blk != num_input_blks * num_regs_per_blk");
+#endif
 
         static_assert(selected_mfma.num_output_blks == selected_mfma.num_input_blks ||
                           selected_mfma.num_output_blks == 1,
@@ -1424,8 +1694,9 @@ struct XdlopsGemm
         static_assert(MPerXdlops == 4 || MPerXdlops == 8 || MPerXdlops == 16 || MPerXdlops == 32 ||
                           MPerXdlops == 64,
                       "Only support GemmMPerXdlops == 4, 8, 16, 32 or 64 for xdlops");
-
+#if defined(__HIP_DEVICE_COMPILE__)
         static_assert(KPack % mfma_instr.k_per_blk == 0, "KPack should be a multiple of k_per_blk");
+#endif
     }
 
     // XDL output supporting C = A * B
@@ -1434,10 +1705,11 @@ struct XdlopsGemm
     __host__ __device__ static constexpr auto
     MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CDesc_M0_N0_M1_N1_M2_N2& c_desc_m0_n0_m1_n1_m2_n2)
     {
-        const auto M0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I0);
-        const auto N0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I1);
-        const auto M1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I2);
-        const auto N1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I3);
+        const auto M0           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I0);
+        const auto N0           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I1);
+        const auto M1           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I2);
+        const auto N1           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I3);
+        constexpr auto num_blks = mfma_instr.m_per_blk / mfma_instr.num_regs_per_blk;
 
         return transform_tensor_descriptor(
             c_desc_m0_n0_m1_n1_m2_n2,
@@ -1446,7 +1718,7 @@ struct XdlopsGemm
                        make_pass_through_transform(M1),
                        make_pass_through_transform(N1),
                        make_unmerge_transform(make_tuple(Number<mfma_instr.num_groups_per_blk>{},
-                                                         Number<mfma_instr.num_input_blks>{},
+                                                         Number<num_blks>{},
                                                          Number<mfma_instr.group_size>{})),
                        make_pass_through_transform(Number<mfma_instr.num_threads_per_blk>{})),
             make_tuple(Sequence<0>{},
@@ -1469,12 +1741,13 @@ struct XdlopsGemm
     __host__ __device__ static constexpr auto MakeCDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3(
         const CDesc_M0_N0_M1_N1_M2_N2& c_desc_m0_n0_m1_n1_m2_n2)
     {
-        const auto M0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I0);
-        const auto N0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I1);
-        const auto M1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I2);
-        const auto N1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I3);
-        const auto M2 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I4);
-        const auto N2 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I5);
+        const auto M0           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I0);
+        const auto N0           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I1);
+        const auto M1           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I2);
+        const auto N1           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I3);
+        const auto M2           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I4);
+        const auto N2           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I5);
+        constexpr auto num_blks = mfma_instr.m_per_blk / mfma_instr.num_regs_per_blk;
 
         return transform_tensor_descriptor(
             c_desc_m0_n0_m1_n1_m2_n2,
@@ -1485,7 +1758,7 @@ struct XdlopsGemm
                        make_pass_through_transform(M2),
                        make_pass_through_transform(N2),
                        make_unmerge_transform(make_tuple(Number<mfma_instr.num_groups_per_blk>{},
-                                                         Number<mfma_instr.num_input_blks>{},
+                                                         Number<num_blks>{},
                                                          Number<mfma_instr.group_size>{})),
                        make_pass_through_transform(Number<mfma_instr.num_threads_per_blk>{})),
             make_tuple(Sequence<0>{},
@@ -1512,10 +1785,11 @@ struct XdlopsGemm
     __host__ __device__ static constexpr auto
     MakeCDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(const CDesc_M0_N0_M1_N1_M2_N2& c_desc_m0_n0_m1_n1_m2_n2)
     {
-        const auto M0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I0);
-        const auto N0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I1);
-        const auto M1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I2);
-        const auto N1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I3);
+        const auto M0           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I0);
+        const auto N0           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I1);
+        const auto M1           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I2);
+        const auto N1           = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I3);
+        constexpr auto num_blks = mfma_instr.m_per_blk / mfma_instr.num_regs_per_blk;
 
         return transform_tensor_descriptor(
             c_desc_m0_n0_m1_n1_m2_n2,
@@ -1525,7 +1799,7 @@ struct XdlopsGemm
                        make_pass_through_transform(N1),
                        make_pass_through_transform(Number<mfma_instr.num_threads_per_blk>{}),
                        make_unmerge_transform(make_tuple(Number<mfma_instr.num_groups_per_blk>{},
-                                                         Number<mfma_instr.num_input_blks>{},
+                                                         Number<num_blks>{},
                                                          Number<mfma_instr.group_size>{}))),
             make_tuple(Sequence<0>{},
                        Sequence<1>{},
@@ -1545,11 +1819,12 @@ struct XdlopsGemm
     __host__ __device__ static constexpr auto MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(
         const CDesc_G_M0_N0_M1_N1_M2_N2& c_desc_g_m0_n0_m1_n1_m2_n2)
     {
-        const auto G  = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I0);
-        const auto M0 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I1);
-        const auto N0 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I2);
-        const auto M1 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I3);
-        const auto N1 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I4);
+        const auto G            = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I0);
+        const auto M0           = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I1);
+        const auto N0           = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I2);
+        const auto M1           = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I3);
+        const auto N1           = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I4);
+        constexpr auto num_blks = mfma_instr.m_per_blk / mfma_instr.num_regs_per_blk;
 
         return transform_tensor_descriptor(
             c_desc_g_m0_n0_m1_n1_m2_n2,
@@ -1558,9 +1833,8 @@ struct XdlopsGemm
                        make_pass_through_transform(N0),
                        make_pass_through_transform(M1),
                        make_pass_through_transform(N1),
-                       make_unmerge_transform(make_tuple(mfma_instr.num_groups_per_blk,
-                                                         mfma_instr.num_input_blks,
-                                                         mfma_instr.group_size)),
+                       make_unmerge_transform(make_tuple(
+                           mfma_instr.num_groups_per_blk, num_blks, mfma_instr.group_size)),
                        make_pass_through_transform(mfma_instr.num_threads_per_blk)),
             make_tuple(Sequence<0>{},
                        Sequence<1>{},
@@ -1642,8 +1916,32 @@ struct XdlopsGemm
 
     __device__ static auto GetBlkIdx()
     {
-        const auto laneId = GetLaneId();
+        const auto laneId       = GetLaneId();
+        constexpr auto num_blks = mfma_instr.m_per_blk / mfma_instr.num_regs_per_blk;
 
+        constexpr auto threadidx_to_blk_idx_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(
+                make_merge_transform(make_tuple(1, num_blks, mfma_instr.num_threads_per_blk))),
+            make_tuple(Sequence<0, 1, 2>{}),
+            make_tuple(Sequence<0>{}));
+
+        const auto blk_idx =
+            threadidx_to_blk_idx_adaptor.CalculateBottomIndex(make_multi_index(laneId));
+
+        const auto blk_id = blk_idx[I1];
+        const auto blk_td = blk_idx[I2];
+
+        return make_tuple(blk_id, blk_td);
+    }
+
+    template <bool SwizzleA>
+    __device__ static auto GetGfx11InputBlkIdx()
+    {
+        auto laneId = GetLaneId() % mfma_instr.num_threads_per_blk; // reassigned below
+        if constexpr(SwizzleA)
+        {
+            laneId = ((laneId & 1) << 3) | (laneId >> 1); // bit 0 -> bit 3, rest shift down
+        }
         constexpr auto threadidx_to_blk_idx_adaptor = make_single_stage_tensor_adaptor(
             make_tuple(make_merge_transform(
                 make_tuple(1, mfma_instr.num_input_blks, mfma_instr.num_threads_per_blk))),
@@ -1661,8 +1959,12 @@ struct XdlopsGemm
 
     __host__ __device__ static auto CalculateAThreadOriginDataIndex()
     {
-        const auto laneId  = GetLaneId();
+        const auto laneId = GetLaneId();
+#if defined(__gfx11__)
+        const auto blk_idx = GetGfx11InputBlkIdx<true>();
+#else
         const auto blk_idx = GetBlkIdx();
+#endif
 
         const auto blk_id = blk_idx[I0];
         const auto blk_td = blk_idx[I1];
@@ -1679,8 +1981,12 @@ struct XdlopsGemm
 
     __host__ __device__ static auto CalculateBThreadOriginDataIndex()
     {
-        const auto laneId  = GetLaneId();
+        const auto laneId = GetLaneId();
+#if defined(__gfx11__)
+        const auto blk_idx = GetGfx11InputBlkIdx<false>();
+#else
         const auto blk_idx = GetBlkIdx();
+#endif
 
         const auto blk_id = blk_idx[I0];
         const auto blk_td = blk_idx[I1];
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp
index bd3ab10802..efc7f20cdc 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp
@@ -192,7 +192,7 @@ struct TransformConvBwdWeightToGemm
             const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
                 out_gemmkpad_gemmm_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmM)),
+                           make_right_pad_transform(GemmM, PadGemmM)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
 
@@ -210,7 +210,7 @@ struct TransformConvBwdWeightToGemm
             const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                 in_gemmkpad_gemmn_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmN)),
+                           make_right_pad_transform(GemmN, PadGemmN)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
 
@@ -218,9 +218,17 @@ struct TransformConvBwdWeightToGemm
             const auto wei_gemmm_gemmn_grid_desc =
                 make_naive_tensor_descriptor_packed(make_tuple(K, X * C));
 
+            // Padd
+            const auto wei_gemmm_gemmn_pad_grid_desc =
+                transform_tensor_descriptor(wei_gemmm_gemmn_grid_desc,
+                                            make_tuple(make_right_pad_transform(GemmM, PadGemmM),
+                                                       make_right_pad_transform(GemmN, PadGemmN)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
             return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
                               in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                              wei_gemmm_gemmn_grid_desc);
+                              wei_gemmm_gemmn_pad_grid_desc);
         }
         else
         {
@@ -240,7 +248,7 @@ struct TransformConvBwdWeightToGemm
             const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
                 out_gemmkpad_gemmm_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmM)),
+                           make_right_pad_transform(GemmM, PadGemmM)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
 
@@ -279,7 +287,7 @@ struct TransformConvBwdWeightToGemm
             const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                 in_gemmkpad_gemmn_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmN)),
+                           make_right_pad_transform(GemmN, PadGemmN)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
 
@@ -288,26 +296,6 @@ struct TransformConvBwdWeightToGemm
                 make_naive_tensor_descriptor_packed(make_tuple(K, X * C));
 
             // Padd
-            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc =
-                transform_tensor_descriptor(
-                    out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
-                    make_tuple(make_pass_through_transform(GemmKBatch),
-                               make_pass_through_transform(GemmK0),
-                               make_right_pad_transform(GemmM, PadGemmM),
-                               make_pass_through_transform(GemmK1Number)),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc =
-                transform_tensor_descriptor(
-                    in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                    make_tuple(make_pass_through_transform(GemmKBatch),
-                               make_pass_through_transform(GemmK0),
-                               make_right_pad_transform(GemmN, PadGemmN),
-                               make_pass_through_transform(GemmK1Number)),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
             const auto wei_gemmm_gemmn_pad_grid_desc =
                 transform_tensor_descriptor(wei_gemmm_gemmn_grid_desc,
                                             make_tuple(make_right_pad_transform(GemmM, PadGemmM),
@@ -315,8 +303,8 @@ struct TransformConvBwdWeightToGemm
                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
                                             make_tuple(Sequence<0>{}, Sequence<1>{}));
 
-            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc,
-                              in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc,
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
                               wei_gemmm_gemmn_pad_grid_desc);
         }
     }
@@ -392,7 +380,7 @@ struct TransformConvBwdWeightToGemm
             const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
                 out_gemmkpad_gemmm_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmM)),
+                           make_right_pad_transform(GemmM, PadGemmM)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
 
@@ -407,13 +395,21 @@ struct TransformConvBwdWeightToGemm
             const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                 in_gemmkpad_gemmn_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmN)),
+                           make_right_pad_transform(GemmN, PadGemmN)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
 
+            // Padd
+            const auto wei_gemmm_gemmn_pad_grid_desc =
+                transform_tensor_descriptor(wei_grid_desc,
+                                            make_tuple(make_right_pad_transform(GemmM, PadGemmM),
+                                                       make_right_pad_transform(GemmN, PadGemmN)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
             return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
                               in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                              wei_grid_desc);
+                              wei_gemmm_gemmn_pad_grid_desc);
         }
         else
         {
@@ -428,7 +424,7 @@ struct TransformConvBwdWeightToGemm
             const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
                 out_gemmkpad_gemmm_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmM)),
+                           make_right_pad_transform(GemmM, PadGemmM)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
 
@@ -469,31 +465,11 @@ struct TransformConvBwdWeightToGemm
             const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                 in_gemmkpad_gemmn_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmN)),
+                           make_right_pad_transform(GemmN, PadGemmN)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
 
             // Padd
-            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc =
-                transform_tensor_descriptor(
-                    out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
-                    make_tuple(make_pass_through_transform(GemmKBatch),
-                               make_pass_through_transform(GemmK0),
-                               make_right_pad_transform(GemmM, PadGemmM),
-                               make_pass_through_transform(GemmK1Number)),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc =
-                transform_tensor_descriptor(
-                    in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                    make_tuple(make_pass_through_transform(GemmKBatch),
-                               make_pass_through_transform(GemmK0),
-                               make_right_pad_transform(GemmN, PadGemmN),
-                               make_pass_through_transform(GemmK1Number)),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
             const auto wei_gemmm_gemmn_pad_grid_desc =
                 transform_tensor_descriptor(wei_grid_desc,
                                             make_tuple(make_right_pad_transform(GemmM, PadGemmM),
@@ -501,8 +477,8 @@ struct TransformConvBwdWeightToGemm
                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
                                             make_tuple(Sequence<0>{}, Sequence<1>{}));
 
-            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc,
-                              in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc,
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
                               wei_gemmm_gemmn_pad_grid_desc);
         }
     }
@@ -585,7 +561,7 @@ struct TransformConvBwdWeightToGemm
             const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
                 out_gemmkpad_gemmm_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmM)),
+                           make_right_pad_transform(GemmM, PadGemmM)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
 
@@ -600,13 +576,21 @@ struct TransformConvBwdWeightToGemm
             const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                 in_gemmkpad_gemmn_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmN)),
+                           make_right_pad_transform(GemmN, PadGemmN)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
 
+            // Padd
+            const auto wei_gemmm_gemmn_pad_grid_desc =
+                transform_tensor_descriptor(wei_grid_desc,
+                                            make_tuple(make_right_pad_transform(GemmM, PadGemmM),
+                                                       make_right_pad_transform(GemmN, PadGemmN)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
             return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
                               in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                              wei_grid_desc);
+                              wei_gemmm_gemmn_pad_grid_desc);
         }
         else
         {
@@ -621,7 +605,7 @@ struct TransformConvBwdWeightToGemm
             const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
                 out_gemmkpad_gemmm_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmM)),
+                           make_right_pad_transform(GemmM, PadGemmM)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
 
@@ -671,31 +655,11 @@ struct TransformConvBwdWeightToGemm
             const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                 in_gemmkpad_gemmn_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmN)),
+                           make_right_pad_transform(GemmN, PadGemmN)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
 
             // Padd
-            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc =
-                transform_tensor_descriptor(
-                    out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
-                    make_tuple(make_pass_through_transform(GemmKBatch),
-                               make_pass_through_transform(GemmK0),
-                               make_right_pad_transform(GemmM, PadGemmM),
-                               make_pass_through_transform(GemmK1Number)),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc =
-                transform_tensor_descriptor(
-                    in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                    make_tuple(make_pass_through_transform(GemmKBatch),
-                               make_pass_through_transform(GemmK0),
-                               make_right_pad_transform(GemmN, PadGemmN),
-                               make_pass_through_transform(GemmK1Number)),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
             const auto wei_gemmm_gemmn_pad_grid_desc =
                 transform_tensor_descriptor(wei_grid_desc,
                                             make_tuple(make_right_pad_transform(GemmM, PadGemmM),
@@ -703,8 +667,8 @@ struct TransformConvBwdWeightToGemm
                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
                                             make_tuple(Sequence<0>{}, Sequence<1>{}));
 
-            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc,
-                              in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc,
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
                               wei_gemmm_gemmn_pad_grid_desc);
         }
     } // function end
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp
index b72ddb8243..e410f06190 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp
@@ -374,7 +374,7 @@ struct TransformConvBwdWeightToGemmV2
             const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
                 out_gemmkpad_gemmm_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmM)),
+                           make_right_pad_transform(GemmM, PadGemmM)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
@@ -390,13 +390,21 @@ struct TransformConvBwdWeightToGemmV2
             const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                 in_gemmkpad_gemmn_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmN)),
+                           make_right_pad_transform(GemmN, PadGemmN)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
+            // Right-pad the weight descriptor along GemmM and GemmN
+            const auto wei_gemmm_gemmn_pad_grid_desc =
+                transform_tensor_descriptor(wei_grid_desc,
+                                            make_tuple(make_right_pad_transform(GemmM, PadGemmM),
+                                                       make_right_pad_transform(GemmN, PadGemmN)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
             return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
                               in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                              wei_grid_desc);
+                              wei_gemmm_gemmn_pad_grid_desc);
         }
         else
         {
@@ -412,7 +420,7 @@ struct TransformConvBwdWeightToGemmV2
             const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
                 out_gemmkpad_gemmm_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmM)),
+                           make_right_pad_transform(GemmM, PadGemmM)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
@@ -453,29 +461,11 @@ struct TransformConvBwdWeightToGemmV2
             const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                 in_gemmkpad_gemmn_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmN)),
+                           make_right_pad_transform(GemmN, PadGemmN)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
             // Padd
-            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc =
-                transform_tensor_descriptor(
-                    out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
-                    make_tuple(make_pass_through_transform(GemmKBatch * GemmK0),
-                               make_right_pad_transform(GemmM, PadGemmM),
-                               make_pass_through_transform(GemmK1Number)),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-
-            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc =
-                transform_tensor_descriptor(
-                    in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                    make_tuple(make_pass_through_transform(GemmKBatch * GemmK0),
-                               make_right_pad_transform(GemmN, PadGemmN),
-                               make_pass_through_transform(GemmK1Number)),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-
             const auto wei_gemmm_gemmn_pad_grid_desc =
                 transform_tensor_descriptor(wei_grid_desc,
                                             make_tuple(make_right_pad_transform(GemmM, PadGemmM),
@@ -483,8 +473,8 @@ struct TransformConvBwdWeightToGemmV2
                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
                                             make_tuple(Sequence<0>{}, Sequence<1>{}));
 
-            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc,
-                              in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc,
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
                               wei_gemmm_gemmn_pad_grid_desc);
         }
 
@@ -562,7 +552,7 @@ struct TransformConvBwdWeightToGemmV2
             const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
                 out_gemmkpad_gemmm_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmM)),
+                           make_right_pad_transform(GemmM, PadGemmM)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
@@ -578,13 +568,21 @@ struct TransformConvBwdWeightToGemmV2
             const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                 in_gemmkpad_gemmn_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmN)),
+                           make_right_pad_transform(GemmN, PadGemmN)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
+            // Right-pad the weight descriptor along GemmM and GemmN
+            const auto wei_gemmm_gemmn_pad_grid_desc =
+                transform_tensor_descriptor(wei_grid_desc,
+                                            make_tuple(make_right_pad_transform(GemmM, PadGemmM),
+                                                       make_right_pad_transform(GemmN, PadGemmN)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
             return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
                               in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                              wei_grid_desc);
+                              wei_gemmm_gemmn_pad_grid_desc);
         }
         else
         {
@@ -600,7 +598,7 @@ struct TransformConvBwdWeightToGemmV2
             const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
                 out_gemmkpad_gemmm_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmM)),
+                           make_right_pad_transform(GemmM, PadGemmM)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
@@ -650,29 +648,11 @@ struct TransformConvBwdWeightToGemmV2
             const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                 in_gemmkpad_gemmn_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmN)),
+                           make_right_pad_transform(GemmN, PadGemmN)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
             // Padd
-            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc =
-                transform_tensor_descriptor(
-                    out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
-                    make_tuple(make_pass_through_transform(GemmKBatch * GemmK0),
-                               make_right_pad_transform(GemmM, PadGemmM),
-                               make_pass_through_transform(GemmK1Number)),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-
-            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc =
-                transform_tensor_descriptor(
-                    in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                    make_tuple(make_pass_through_transform(GemmKBatch * GemmK0),
-                               make_right_pad_transform(GemmN, PadGemmN),
-                               make_pass_through_transform(GemmK1Number)),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-
             const auto wei_gemmm_gemmn_pad_grid_desc =
                 transform_tensor_descriptor(wei_grid_desc,
                                             make_tuple(make_right_pad_transform(GemmM, PadGemmM),
@@ -680,8 +660,8 @@ struct TransformConvBwdWeightToGemmV2
                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
                                             make_tuple(Sequence<0>{}, Sequence<1>{}));
 
-            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc,
-                              in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc,
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
                               wei_gemmm_gemmn_pad_grid_desc);
         }
     }
@@ -765,7 +745,7 @@ struct TransformConvBwdWeightToGemmV2
             const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
                 out_gemmkpad_gemmm_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmM)),
+                           make_right_pad_transform(GemmM, PadGemmM)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
@@ -781,13 +761,21 @@ struct TransformConvBwdWeightToGemmV2
             const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                 in_gemmkpad_gemmn_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmN)),
+                           make_right_pad_transform(GemmN, PadGemmN)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
+            // Right-pad the weight descriptor along GemmM and GemmN
+            const auto wei_gemmm_gemmn_pad_grid_desc =
+                transform_tensor_descriptor(wei_grid_desc,
+                                            make_tuple(make_right_pad_transform(GemmM, PadGemmM),
+                                                       make_right_pad_transform(GemmN, PadGemmN)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
             return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
                               in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                              wei_grid_desc);
+                              wei_gemmm_gemmn_pad_grid_desc);
         }
         else
         {
@@ -803,7 +791,7 @@ struct TransformConvBwdWeightToGemmV2
             const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
                 out_gemmkpad_gemmm_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmM)),
+                           make_right_pad_transform(GemmM, PadGemmM)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
@@ -868,29 +856,11 @@ struct TransformConvBwdWeightToGemmV2
             const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                 in_gemmkpad_gemmn_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
-                           make_pass_through_transform(GemmN)),
+                           make_right_pad_transform(GemmN, PadGemmN)),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
             // Padd
-            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc =
-                transform_tensor_descriptor(
-                    out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
-                    make_tuple(make_pass_through_transform(GemmKBatch * GemmK0),
-                               make_right_pad_transform(GemmM, PadGemmM),
-                               make_pass_through_transform(GemmK1Number)),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-
-            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc =
-                transform_tensor_descriptor(
-                    in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
-                    make_tuple(make_pass_through_transform(GemmKBatch * GemmK0),
-                               make_right_pad_transform(GemmN, PadGemmN),
-                               make_pass_through_transform(GemmK1Number)),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-
             const auto wei_gemmm_gemmn_pad_grid_desc =
                 transform_tensor_descriptor(wei_grid_desc,
                                             make_tuple(make_right_pad_transform(GemmM, PadGemmM),
@@ -898,8 +868,8 @@ struct TransformConvBwdWeightToGemmV2
                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
                                             make_tuple(Sequence<0>{}, Sequence<1>{}));
 
-            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc,
-                              in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc,
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
                               wei_gemmm_gemmn_pad_grid_desc);
         }
     } // function end
diff --git a/include/ck/utility/blkgemmpipe_scheduler.hpp b/include/ck/utility/blkgemmpipe_scheduler.hpp
index 861b81b1f6..63466a36f2 100644
--- a/include/ck/utility/blkgemmpipe_scheduler.hpp
+++ b/include/ck/utility/blkgemmpipe_scheduler.hpp
@@ -75,9 +75,9 @@ template <index_t BlockSize,
           bool IsF4F6 = false>
 struct BlockwiseGemmXdlops_pipeline_hotloop_inst
 {
-    static constexpr index_t WaveSize = 64;
     static constexpr index_t WaveNumM = MPerBlock / (MRepeat * MPerXDL);
     static constexpr index_t WaveNumN = NPerBlock / (NRepeat * NPerXDL);
+    static constexpr index_t WaveSize = BlockSize / WaveNumM / WaveNumN;
 
     static constexpr index_t A_LDS_Read_Width = ALDSReadWidth;
     static constexpr index_t B_LDS_Read_Width = BLDSReadWidth;
diff --git a/include/ck/utility/get_id.hpp b/include/ck/utility/get_id.hpp
index fd0d1024b2..53e865767b 100644
--- a/include/ck/utility/get_id.hpp
+++ b/include/ck/utility/get_id.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -7,6 +7,38 @@
 
 namespace ck {
 
+#if defined(CK_ENABLE_DYNAMIC_WARP_SIZE)
+__device__ constexpr index_t get_warp_size()
+{
+#if defined(__HIP_DEVICE_COMPILE__)
+#if defined(__GFX9__)
+    return 64;
+#else
+    return 32;
+#endif
+#else
+    return 64;
+#endif
+}
+
+inline __host__ index_t get_warp_size()
+{
+#if !(defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC))
+    int device  = 0;
+    int result  = 0;
+    auto status = hipGetDevice(&device);
+    if(status == hipSuccess)
+    {
+        status = hipDeviceGetAttribute(&result, hipDeviceAttributeWarpSize, device);
+        if(status == hipSuccess)
+        {
+            return result;
+        }
+    }
+#endif
+    return 64;
+}
+#else
 __host__ __device__ constexpr index_t get_warp_size()
 {
 #if defined(__GFX9__) || !defined(__HIP_DEVICE_COMPILE__)
@@ -15,6 +47,7 @@ __host__ __device__ constexpr index_t get_warp_size()
     return 32;
 #endif
 }
+#endif
 
 __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
 
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index 35da19cd3e..037e86909d 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -41,10 +41,6 @@ CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t siz
 {
     buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD};
     int32x4_t r = __builtin_bit_cast(int32x4_t, res);
-    r.x         = __builtin_amdgcn_readfirstlane(r.x);
-    r.y         = __builtin_amdgcn_readfirstlane(r.y);
-    r.z         = __builtin_amdgcn_readfirstlane(r.z);
-    r.w         = __builtin_amdgcn_readfirstlane(r.w);
     return r;
 }
 
@@ -1280,26 +1276,46 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
                                 index_t offset,
                                 index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");
 
-template <bool pre_nop = false>
-CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem,
-                                              int32x4_t rsrc,
-                                              index_t voffset,
-                                              index_t /*soffset*/,
-                                              index_t ioffset /*max 0xFFF*/,
-                                              index_t /*flag*/       = 0,
-                                              bool_constant<pre_nop> = {})
+template <unsigned num_dwords, bool pre_nop = false>
+CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void* smem,
+                                                int32x4_t rsrc,
+                                                index_t voffset,
+                                                index_t /*soffset*/,
+                                                index_t ioffset /*max 0xFFF*/,
+                                                index_t /*flag*/       = 0,
+                                                bool_constant<pre_nop> = {})
 {
-    if constexpr(pre_nop)
-        asm volatile("s_nop 4\n"
-                     "buffer_load_dword %1, %2, 0 offen offset:%3 lds"
-                     : "=r"(smem) /*dummy dependency for smem*/
-                     : "v"(voffset), "s"(rsrc), "n"(ioffset)
+#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr)                            \
+    if constexpr(pre_nop)                                               \
+        asm volatile("s_nop 4\n" instr " %1, %2, 0 offen offset:%3 lds" \
+                     : "=r"(smem) /*dummy dependency for smem*/         \
+                     : "v"(voffset), "s"(rsrc), "n"(ioffset)            \
+                     : "memory");                                       \
+    else                                                                \
+        asm volatile(instr " %1, %2, 0 offen offset:%3 lds"             \
+                     : "=r"(smem) /*dummy dependency for smem*/         \
+                     : "v"(voffset), "s"(rsrc), "n"(ioffset)            \
                      : "memory");
+
+    if constexpr(num_dwords == 1)
+    {
+        CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dword");
+    }
+#if defined(__gfx950__)
+    else if constexpr(num_dwords == 3)
+    {
+        CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx3");
+    }
+    else if constexpr(num_dwords == 4)
+    {
+        CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx4");
+    }
+#endif
     else
-        asm volatile("buffer_load_dword %1, %2, 0 offen offset:%3 lds"
-                     : "=r"(smem) /*dummy dependency for smem*/
-                     : "v"(voffset), "s"(rsrc), "n"(ioffset)
-                     : "memory");
+    {
+        static_assert(false, "wrong! not implemented data width");
+    }
+#undef CK_TILE_ASYNC_LOAD_WITH_INSTR
 }
 
 CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
@@ -1318,6 +1334,17 @@ enum struct amd_buffer_coherence_enum
     glc               = 1,
     slc               = 2,
     glc_slc           = 3,
+    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+    // SC[1:0] scope: 0=wave, 1=group, 2=device, 3=system; NT: 0=temporal reuse expected, 1=not
+    // NOTE(review): DEVICE_*/SYSTEM_* use value 8 (bit 3), not bit 4 as noted above -- confirm
+    WAVE_NT0   = 0,
+    WAVE_NT1   = 2,
+    GROUP_NT0  = 1,
+    GROUP_NT1  = 3,
+    DEVICE_NT0 = 8,
+    DEVICE_NT1 = 10,
+    SYSTEM_NT0 = 9,
+    SYSTEM_NT1 = 11,
 };
 
 template <index_t N,
@@ -1759,15 +1786,18 @@ CK_TILE_DEVICE void amd_async_buffer_load_impl(CK_TILE_LDS_ADDR T* smem,
                                                index_t src_immediate_addr_offset = 0,
                                                bool_constant<pre_nop>            = {})
 {
-    static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size");
+    constexpr index_t num_bytes = sizeof(T) * N;
+    constexpr index_t num_words = num_bytes / 4;
+    static_assert(num_bytes % 4 == 0 && (num_words == 1 || num_words == 3 || num_words == 4),
+                  "wrong! only support in dword, dwordx3, dwordx4");
 
-    async_buffer_load_dword_v(smem,
-                              src_wave_buffer_resource,
-                              src_thread_addr_offset,
-                              src_wave_addr_offset,
-                              src_immediate_addr_offset,
-                              0,
-                              bool_constant<pre_nop>{});
+    async_buffer_load_dwordxn_v<num_words>(smem,
+                                           src_wave_buffer_resource,
+                                           src_thread_addr_offset,
+                                           src_wave_addr_offset,
+                                           src_immediate_addr_offset,
+                                           0,
+                                           bool_constant<pre_nop>{});
 }
 
 template <typename T,
@@ -2756,7 +2786,7 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer<T, N>& src_thread_
 
 #if defined(__gfx950__)
 template <typename T, index_t N, address_space_enum BufferAddressSpace>
-__device__ auto amd_transpose_load_to_vgpr(const T* in_ptr)
+__device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
 {
 
     static_assert(__has_builtin(__builtin_amdgcn_raw_buffer_load_b32),
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
index 8c3bc0bc36..d1e4eb3da3 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
@@ -32,10 +32,6 @@ CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t siz
 {
     buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD};
     int32x4_t r = __builtin_bit_cast(int32x4_t, res);
-    r.x         = __builtin_amdgcn_readfirstlane(r.x);
-    r.y         = __builtin_amdgcn_readfirstlane(r.y);
-    r.z         = __builtin_amdgcn_readfirstlane(r.z);
-    r.w         = __builtin_amdgcn_readfirstlane(r.w);
     return r;
 }
 
@@ -1148,26 +1144,46 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
                                 index_t offset,
                                 index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");
 
-template <bool pre_nop = false>
-CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem,
-                                              int32x4_t rsrc,
-                                              index_t voffset,
-                                              index_t /*soffset*/,
-                                              index_t ioffset /*max 0xFFF*/,
-                                              index_t /*flag*/       = 0,
-                                              bool_constant<pre_nop> = {})
+template <unsigned num_dwords, bool pre_nop = false>
+CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void* smem,
+                                                int32x4_t rsrc,
+                                                index_t voffset,
+                                                index_t /*soffset*/,
+                                                index_t ioffset /*max 0xFFF*/,
+                                                index_t /*flag*/       = 0,
+                                                bool_constant<pre_nop> = {})
 {
-    if constexpr(pre_nop)
-        asm volatile("s_nop 4\n"
-                     "buffer_load_dword %1, %2, 0 offen offset:%3 lds"
-                     : "=r"(smem) /*dummy dependency for smem*/
-                     : "v"(voffset), "s"(rsrc), "n"(ioffset)
+#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr)                            \
+    if constexpr(pre_nop)                                               \
+        asm volatile("s_nop 4\n" instr " %1, %2, 0 offen offset:%3 lds" \
+                     : "=r"(smem) /*dummy dependency for smem*/         \
+                     : "v"(voffset), "s"(rsrc), "n"(ioffset)            \
+                     : "memory");                                       \
+    else                                                                \
+        asm volatile(instr " %1, %2, 0 offen offset:%3 lds"             \
+                     : "=r"(smem) /*dummy dependency for smem*/         \
+                     : "v"(voffset), "s"(rsrc), "n"(ioffset)            \
                      : "memory");
+
+    if constexpr(num_dwords == 1)
+    {
+        CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dword");
+    }
+#if defined(__gfx950__)
+    else if constexpr(num_dwords == 3)
+    {
+        CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx3");
+    }
+    else if constexpr(num_dwords == 4)
+    {
+        CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx4");
+    }
+#endif
     else
-        asm volatile("buffer_load_dword %1, %2, 0 offen offset:%3 lds"
-                     : "=r"(smem) /*dummy dependency for smem*/
-                     : "v"(voffset), "s"(rsrc), "n"(ioffset)
-                     : "memory");
+    {
+        static_assert(false, "wrong! not implemented data width");
+    }
+#undef CK_TILE_ASYNC_LOAD_WITH_INSTR
 }
 
 CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
@@ -1186,6 +1202,17 @@ enum struct amd_buffer_coherence_enum
     glc               = 1,
     slc               = 2,
     glc_slc           = 3,
+    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+    // SC[1:0] scope: 0=wave, 1=group, 2=device, 3=system; NT: 0=temporal reuse expected, 1=not
+    // NOTE(review): DEVICE_*/SYSTEM_* use value 8 (bit 3), not bit 4 as noted above -- confirm
+    WAVE_NT0   = 0,
+    WAVE_NT1   = 2,
+    GROUP_NT0  = 1,
+    GROUP_NT1  = 3,
+    DEVICE_NT0 = 8,
+    DEVICE_NT1 = 10,
+    SYSTEM_NT0 = 9,
+    SYSTEM_NT1 = 11,
 };
 
 template <index_t N,
@@ -1529,15 +1556,18 @@ CK_TILE_DEVICE void amd_async_buffer_load_impl(T* smem,
                                                index_t src_immediate_addr_offset = 0,
                                                bool_constant<pre_nop>            = {})
 {
-    static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size");
+    constexpr index_t num_bytes = sizeof(T) * N;
+    constexpr index_t num_words = num_bytes / 4;
+    static_assert(num_bytes % 4 == 0 && (num_words == 1 || num_words == 3 || num_words == 4),
+                  "wrong! only support in dword, dwordx3, dwordx4");
 
-    async_buffer_load_dword_v(smem,
-                              src_wave_buffer_resource,
-                              src_thread_addr_offset,
-                              src_wave_addr_offset,
-                              src_immediate_addr_offset,
-                              0,
-                              bool_constant<pre_nop>{});
+    async_buffer_load_dwordxn_v<num_words>(smem,
+                                           src_wave_buffer_resource,
+                                           src_thread_addr_offset,
+                                           src_wave_addr_offset,
+                                           src_immediate_addr_offset,
+                                           0,
+                                           bool_constant<pre_nop>{});
 }
 
 template <typename T,
@@ -2574,7 +2604,7 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
 
 #if defined(__gfx950__)
 template <typename T, index_t N, address_space_enum BufferAddressSpace>
-__device__ auto amd_transpose_load_to_vgpr(const T* in_ptr)
+__device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
 {
 
     static_assert(__has_builtin(__builtin_amdgcn_raw_buffer_load_b32),
diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp
index ab42ec8617..42f2390cde 100644
--- a/include/ck_tile/core/arch/arch.hpp
+++ b/include/ck_tile/core/arch/arch.hpp
@@ -9,6 +9,7 @@
 #include "ck_tile/core/config.hpp"
 #include "ck_tile/core/numeric/integer.hpp"
 #include "ck_tile/core/numeric/integral_constant.hpp"
+#include "ck_tile/core/utility/ignore.hpp"
 
 #define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111
 #define CK_TILE_VMCNT(cnt)                                              \
@@ -66,6 +67,23 @@ CK_TILE_HOST_DEVICE constexpr index_t get_warp_size()
 #endif
 }
 
+CK_TILE_HOST bool is_wave32()
+{
+    hipDeviceProp_t props{};
+    int device;
+    auto status = hipGetDevice(&device);
+    if(status != hipSuccess)
+    {
+        return false;
+    }
+    status = hipGetDeviceProperties(&props, device);
+    if(status != hipSuccess)
+    {
+        return false;
+    }
+    return props.major > 9;
+}
+
 CK_TILE_DEVICE index_t get_grid_size() { return gridDim.x; }
 
 CK_TILE_DEVICE index_t get_block_size() { return blockDim.x; }
@@ -80,30 +98,24 @@ CK_TILE_DEVICE index_t get_block_1d_id() { return blockIdx.x; }
 // Use these instead
 CK_TILE_DEVICE index_t get_lane_id() { return __lane_id(); }
 
-CK_TILE_DEVICE index_t get_warp_id()
+template <bool ReturnSgpr = true>
+CK_TILE_DEVICE index_t get_warp_id(bool_constant<ReturnSgpr> = {})
 {
-    return __builtin_amdgcn_readfirstlane(threadIdx.x / get_warp_size());
+    const index_t warp_id = threadIdx.x / get_warp_size();
+    if constexpr(ReturnSgpr)
+    {
+        return __builtin_amdgcn_readfirstlane(warp_id);
+    }
+    else
+    {
+        return warp_id;
+    }
 }
 
 CK_TILE_DEVICE index_t get_thread_id() { return threadIdx.x; }
 
 CK_TILE_DEVICE index_t get_block_id() { return blockIdx.x; }
 
-CK_TILE_DEVICE void block_sync_lds()
-{
-#if CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
-    // asm volatile("\
-    // s_waitcnt lgkmcnt(0) \n \
-    // s_barrier \
-    // " ::);
-
-    __builtin_amdgcn_s_waitcnt(0xc07f);
-    __builtin_amdgcn_s_barrier();
-#else
-    __syncthreads();
-#endif
-}
-
 CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
 {
 #ifdef __gfx12__
@@ -174,6 +186,18 @@ CK_TILE_DEVICE void s_waitcnt_barrier()
     __builtin_amdgcn_s_barrier();
 }
 
+template <index_t lgkmcnt = 0> // only lgkmcnt is caller-chosen; vmcnt/expcnt use kMax* values
+CK_TILE_DEVICE void block_sync_lds()
+{
+    s_waitcnt_barrier<waitcnt_arg::kMaxVmCnt, waitcnt_arg::kMaxExpCnt, lgkmcnt>();
+}
+// Variant where vmcnt is the caller-chosen count and expcnt/lgkmcnt use their kMax* values.
+template <index_t vmcnt = 0>
+CK_TILE_DEVICE void block_sync_lds_direct_load()
+{
+    s_waitcnt_barrier<vmcnt, waitcnt_arg::kMaxExpCnt, waitcnt_arg::kMaxLgkmCnt>();
+}
+
 CK_TILE_DEVICE void s_nop(index_t cnt = 0)
 {
 #if 1
@@ -233,4 +257,20 @@ CK_TILE_HOST_DEVICE constexpr const char* address_space_to_string(address_space_
     }
 }
 
+// Architecture tags: empty types returned by get_device_arch()
+struct gfx11_t
+{
+};
+struct gfx12_t
+{
+};
+// Returns gfx11_t{} when __gfx11__ is defined, and gfx12_t{} for every other target.
+CK_TILE_DEVICE static constexpr auto get_device_arch()
+{
+#if defined(__gfx11__)
+    return gfx11_t{};
+#else // no __gfx12__ check is made: any non-gfx11 target lands here
+    return gfx12_t{};
+#endif
+}
 } // namespace ck_tile
diff --git a/include/ck_tile/core/arch/generic_memory_space_atomic.hpp b/include/ck_tile/core/arch/generic_memory_space_atomic.hpp
index 07c6aa0baf..c02c46958c 100644
--- a/include/ck_tile/core/arch/generic_memory_space_atomic.hpp
+++ b/include/ck_tile/core/arch/generic_memory_space_atomic.hpp
@@ -6,6 +6,10 @@
 #include "ck_tile/core/numeric/type_convert.hpp"
 #include "ck_tile/core/container/thread_buffer.hpp"
 
+#define HAS_GLOBAL_ATOMIC_PK_ADD_BUILTIN /* parenthesised: safe under ! and || */ \
+    (__has_builtin(__builtin_amdgcn_global_atomic_fadd_v2f16) &&                  \
+     __has_builtin(__builtin_amdgcn_global_atomic_fadd_v2bf16))
+
 namespace ck_tile {
 
 template <typename T, typename ComputeType>
@@ -32,6 +36,14 @@ CK_TILE_HOST_DEVICE bf16x4_t add_bf16x4_t(const bf16x4_t& a, const bf16x4_t& b)
     return rtn;
 }
 
+CK_TILE_HOST_DEVICE fp16x2_t add_f16x2_t(const fp16x2_t& a, const fp16x2_t& b)
+{
+    fp16x2_t sum; // filled element by element through add<fp16_t, float>
+    sum[0] = add<fp16_t, float>(a[0], b[0]);
+    sum[1] = add<fp16_t, float>(a[1], b[1]);
+    return sum;
+}
+
 CK_TILE_HOST_DEVICE fp8x4_t add_fp8x4_t(const fp8x4_t& a, const fp8x4_t& b)
 {
     fp8x4_t rtn;
@@ -304,6 +316,44 @@ CK_TILE_DEVICE void atomic_add<bf8x8_t>(bf8x8_t* p_dst, bf8x8_t const& x)
     } while(cur_v.u64 != old_v);
 }
 
+// Atomic add for fp16x2_t: one packed v2f16 builtin call when the compiler provides it,
+// otherwise a 32-bit atomicCAS retry loop that re-adds with add_f16x2_t.
+//
+template <>
+CK_TILE_DEVICE void atomic_add<fp16x2_t>(fp16x2_t* p_dst, fp16x2_t const& x)
+{
+#if HAS_GLOBAL_ATOMIC_PK_ADD_BUILTIN
+    __builtin_amdgcn_global_atomic_fadd_v2f16(c_style_pointer_cast<fp16x2_t*>(p_dst), x);
+#else
+    union U32F162_ADDR
+    {
+        uint32_t* u32_a;
+        fp16x2_t* f162_a;
+    };
+
+    union U32F162
+    {
+        uint32_t u32;
+        fp16x2_t f162;
+    };
+    // dword_addr / cur_v / new_ view the same storage as a 32-bit word or as an fp16x2_t.
+    U32F162_ADDR dword_addr;
+    U32F162 cur_v;
+    U32F162 new_;
+    uint32_t old_v, new_v;
+    dword_addr.f162_a = p_dst;
+    cur_v.u32         = *dword_addr.u32_a;
+    // Retry until the word atomicCAS reports back equals the one the sum was built from.
+    do
+    {
+        old_v     = cur_v.u32;
+        new_.f162 = add_f16x2_t(cur_v.f162, x);
+        new_v     = new_.u32;
+        cur_v.u32 = atomicCAS(dword_addr.u32_a, old_v, new_v);
+    } while(cur_v.u32 != old_v);
+#endif
+}
+
 template <typename T, index_t N>
 CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer<T, N>& x)
 {
@@ -311,6 +361,7 @@ CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer<T, N>& x)
                       (std::is_same<T, uint32_t>::value && (N == 1)) ||
                       (std::is_same<T, float>::value && (N == 1 || N == 2)) ||
                       (std::is_same<T, double>::value && (N == 1 || N == 2)) ||
+                      (std::is_same<T, fp16_t>::value && (N == 2 || N == 4 || N == 8)) ||
                       (std::is_same<T, bf16_t>::value && (N == 2 || N == 4 || N == 8)) ||
                       (std::is_same<T, fp8_t>::value && (N == 4 || N == 8 || N == 16)) ||
                       (std::is_same<T, bf8_t>::value && (N == 4 || N == 8 || N == 16)),
@@ -406,6 +457,13 @@ CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer<T, N>& x)
             atomic_add(c_style_pointer_cast<bf8x8_t*>(p_dst) + 1, x.template get_as<bf8x8_t>()[I1]);
         }
     }
+    else if constexpr(std::is_same<T, fp16_t>::value)
+    {
+        static_for<0, N / 2, 1>{}([&](auto i) {
+            atomic_add(c_style_pointer_cast<fp16x2_t*>(p_dst) + i,
+                       x.template get_as<fp16x2_t>()[i]);
+        });
+    }
 }
 
 template <typename T, index_t N>
diff --git a/include/ck_tile/core/arch/utility.hpp b/include/ck_tile/core/arch/utility.hpp
index 7184f99521..93008f8525 100644
--- a/include/ck_tile/core/arch/utility.hpp
+++ b/include/ck_tile/core/arch/utility.hpp
@@ -59,6 +59,21 @@ CK_TILE_DEVICE T warp_shuffle_down(const T& v_local, uint32_t lane_delta)
 #endif
 }
 
+// Feeds the 32-bit pattern of v_local to permlane32_swap as both operands and
+// returns the two resulting dwords, converted back to T, as a 2-element buffer.
+template <typename T>
+CK_TILE_DEVICE auto warp_shuffle_down_pair(const T& v_local)
+{
+    static_assert(sizeof(T) == sizeof(int32_t), "wrong!");
+
+    const int32_t raw_bits  = bit_cast<int32_t>(v_local);
+    const int32x2_t swapped = __builtin_amdgcn_permlane32_swap(raw_bits, raw_bits, false, false);
+    thread_buffer<T, 2> pair;
+    pair(0) = bit_cast<T>(swapped[0]);
+    pair(1) = bit_cast<T>(swapped[1]);
+    return pair;
+}
+
 template <typename T>
 CK_TILE_DEVICE T warp_shuffle(const T& v_local, uint32_t src_lane)
 {
diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp
index c471f416c3..7b5b862cb1 100644
--- a/include/ck_tile/core/config.hpp
+++ b/include/ck_tile/core/config.hpp
@@ -152,7 +152,7 @@
 // buffer atomic add: floating point
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
-#elif defined(__gfx9__) // for GPU code
+#elif defined(__gfx9__) || defined(__gfx12__) // for GPU code
 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
 #else // for GPU code
 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
@@ -191,6 +191,16 @@
 #endif
 #endif
 
+// use llvm builtin bf16 data type for HIP 6.5 with patch >= 50421, and for HIP 7 or newer
+#ifndef CK_TILE_USE_LLVM_BUILTIN_BF16
+#if(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 5 && HIP_VERSION_PATCH >= 50421) || \
+    (HIP_VERSION_MAJOR >= 7)
+#define CK_TILE_USE_LLVM_BUILTIN_BF16 1
+#else
+#define CK_TILE_USE_LLVM_BUILTIN_BF16 0
+#endif
+#endif
+
 #ifndef CK_TILE_DEBUG_LOG
 #define CK_TILE_DEBUG_LOG 0
 #endif
diff --git a/include/ck_tile/core/numeric/bfloat16.hpp b/include/ck_tile/core/numeric/bfloat16.hpp
index 6f31468809..245fb7244f 100644
--- a/include/ck_tile/core/numeric/bfloat16.hpp
+++ b/include/ck_tile/core/numeric/bfloat16.hpp
@@ -6,6 +6,9 @@
 #include "ck_tile/core/numeric/half.hpp"
 #include "ck_tile/core/numeric/integral_constant.hpp"
 #include "ck_tile/core/numeric/numeric.hpp"
+#if CK_TILE_USE_LLVM_BUILTIN_BF16
+#include <hip/hip_bfloat16.h>
+#endif
 #include <stdint.h>
 
 #pragma once
@@ -102,7 +105,11 @@ struct native_t<bfloat16_t>
 using bf16_t     = bfloat16_t;
 using bf16_raw_t = typename bf16_t::raw_type;
 #else
+#if CK_TILE_USE_LLVM_BUILTIN_BF16
+using bfloat16_t = __bf16;
+#else
 using bfloat16_t = ushort;
+#endif
 using bf16_t     = bfloat16_t;
 using bf16_raw_t = uint16_t;
 #endif
@@ -280,7 +287,11 @@ template <bf16_rounding_mode rounding =
               static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
 CK_TILE_HOST_DEVICE constexpr bfloat16_t float_to_bf16(float f, constant<rounding> = {})
 {
+#if defined(__gfx950__)
+    return static_cast<bfloat16_t>(f);
+#else
     return bit_cast<bfloat16_t>(float_to_bf16_raw(f, constant<rounding>{}));
+#endif
 }
 
 template <bf16_rounding_mode rounding =
diff --git a/include/ck_tile/core/numeric/e8m0.hpp b/include/ck_tile/core/numeric/e8m0.hpp
index ea94880f27..ba122b7f66 100644
--- a/include/ck_tile/core/numeric/e8m0.hpp
+++ b/include/ck_tile/core/numeric/e8m0.hpp
@@ -87,7 +87,7 @@ CK_TILE_HOST_DEVICE constexpr e8m0_bexp_t::operator float() const
     using traits = numeric_traits<float>;
     if(data == numeric<e8m0_t>::binary_nan)
     {
-        return traits::NaN;
+        return std::numeric_limits<float>::signaling_NaN();
     }
     else if(data == 0)
     {
diff --git a/include/ck_tile/core/numeric/pk_fp4.hpp b/include/ck_tile/core/numeric/pk_fp4.hpp
index a345cd1b75..7464bc7c48 100644
--- a/include/ck_tile/core/numeric/pk_fp4.hpp
+++ b/include/ck_tile/core/numeric/pk_fp4.hpp
@@ -21,7 +21,7 @@ namespace ck_tile {
 using fp32_t   = float;
 using fp32x2_t = float __attribute__((ext_vector_type(2)));
 using fp16x2_t = _Float16 __attribute__((ext_vector_type(2)));
-using bf16x2_t = bf16_raw_t __attribute__((ext_vector_type(2)));
+using bf16x2_t = bfloat16_t __attribute__((ext_vector_type(2)));
 
 CK_TILE_HOST_DEVICE constexpr uint8_t float_to_e2m1(float x, float scale = 1.f);
 
diff --git a/include/ck_tile/core/numeric/pk_int4.hpp b/include/ck_tile/core/numeric/pk_int4.hpp
index ba8b87a9b8..0b0eb70beb 100644
--- a/include/ck_tile/core/numeric/pk_int4.hpp
+++ b/include/ck_tile/core/numeric/pk_int4.hpp
@@ -99,7 +99,7 @@ struct numeric_traits<pk_int4_t>
 
 using fp32x2_t = float __attribute__((ext_vector_type(2)));
 using fp16x2_t = _Float16 __attribute__((ext_vector_type(2)));
-using bf16x2_t = bf16_raw_t __attribute__((ext_vector_type(2)));
+using bf16x2_t = bfloat16_t __attribute__((ext_vector_type(2)));
 
 CK_TILE_HOST_DEVICE fp32x2_t pk_int4_t_to_fp32x2_t(const pk_int4_t& x)
 {
diff --git a/include/ck_tile/core/numeric/vector_type.hpp b/include/ck_tile/core/numeric/vector_type.hpp
index 58bdb43b08..bbd3d53827 100644
--- a/include/ck_tile/core/numeric/vector_type.hpp
+++ b/include/ck_tile/core/numeric/vector_type.hpp
@@ -131,12 +131,12 @@ using fp16x64_t = _Float16 __attribute__((ext_vector_type(64)));
 
 // bf16
 // using bf16_t = ...
-using bf16x2_t  = bf16_raw_t __attribute__((ext_vector_type(2)));
-using bf16x4_t  = bf16_raw_t __attribute__((ext_vector_type(4)));
-using bf16x8_t  = bf16_raw_t __attribute__((ext_vector_type(8)));
-using bf16x16_t = bf16_raw_t __attribute__((ext_vector_type(16)));
-using bf16x32_t = bf16_raw_t __attribute__((ext_vector_type(32)));
-using bf16x64_t = bf16_raw_t __attribute__((ext_vector_type(64)));
+using bf16x2_t  = bfloat16_t __attribute__((ext_vector_type(2)));
+using bf16x4_t  = bfloat16_t __attribute__((ext_vector_type(4)));
+using bf16x8_t  = bfloat16_t __attribute__((ext_vector_type(8)));
+using bf16x16_t = bfloat16_t __attribute__((ext_vector_type(16)));
+using bf16x32_t = bfloat16_t __attribute__((ext_vector_type(32)));
+using bf16x64_t = bfloat16_t __attribute__((ext_vector_type(64)));
 
 // i32
 // using int32_t = ...
diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp
index 269465fae6..a85dbc6d00 100644
--- a/include/ck_tile/core/tensor/tensor_view.hpp
+++ b/include/ck_tile/core/tensor/tensor_view.hpp
@@ -445,6 +445,7 @@ struct null_tensor_view
 };
 
 template <address_space_enum BufferAddressSpace = address_space_enum::generic,
+          memory_operation_enum DstInMemOp      = memory_operation_enum::set,
           amd_buffer_coherence_enum Coherence   = amd_buffer_coherence_enum::coherence_default,
           typename DataType,
           typename... Ts>
diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp
index ad5902f16e..f5ddcd278c 100644
--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
@@ -288,8 +288,11 @@ struct tile_window_with_static_distribution
                 sizeof(LdsDataType) -
             size_per_buf;
 
-        const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id();
-        m0_set_with_memory(m0_init_value); // This should be wave independent
+        // Use VALU so the compiler can optimize redundant/repeated computations
+        const index_t m0_init_value =
+            size_per_buf + size_per_wave * get_warp_id(/*ReturnSgpr=*/bool_constant<false>{});
+        m0_set_with_memory(
+            __builtin_amdgcn_readfirstlane(m0_init_value)); // This should be wave independent
 
         using Traits = typename Base::Traits;
 
diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp
index aa5afd25e5..41f5200413 100644
--- a/include/ck_tile/host.hpp
+++ b/include/ck_tile/host.hpp
@@ -27,6 +27,7 @@
 #include "ck_tile/host/reference/reference_elementwise.hpp"
 #include "ck_tile/host/reference/reference_fused_moe.hpp"
 #include "ck_tile/host/reference/reference_gemm.hpp"
+#include "ck_tile/host/reference/reference_grouped_conv_bwd_data.hpp"
 #include "ck_tile/host/reference/reference_grouped_conv_bwd_weight.hpp"
 #include "ck_tile/host/reference/reference_grouped_conv_fwd.hpp"
 #include "ck_tile/host/reference/reference_im2col.hpp"
diff --git a/include/ck_tile/host/device_prop.hpp b/include/ck_tile/host/device_prop.hpp
index 0d8f89ea31..f86e4b889a 100644
--- a/include/ck_tile/host/device_prop.hpp
+++ b/include/ck_tile/host/device_prop.hpp
@@ -52,6 +52,19 @@ inline std::string get_device_name()
     }
 }
 
+inline bool is_gfx11_supported()
+{
+    const std::string name = get_device_name(); // one device query instead of one per comparison
+    return name == "gfx1100" || name == "gfx1101" || name == "gfx1102" || name == "gfx1103" ||
+           name == "gfx1150" || name == "gfx1151" || name == "gfx1152";
+}
+
+inline bool is_gfx12_supported()
+{
+    const std::string name = get_device_name(); // queried once, then compared twice
+    return name == "gfx1200" || name == "gfx1201";
+}
+
 inline bool is_load_tr_supported()
 {
     // Check if load transpose is supported.
diff --git a/include/ck_tile/host/kernel_launch.hpp b/include/ck_tile/host/kernel_launch.hpp
index 91ac3d5a0b..368a0594c5 100644
--- a/include/ck_tile/host/kernel_launch.hpp
+++ b/include/ck_tile/host/kernel_launch.hpp
@@ -15,9 +15,9 @@
 
 namespace ck_tile {
 
-template <int MaxThreadPerBlock, int MinBlockPerCu, typename Kernel, typename... Args>
+template <int MinBlockPerCu, typename Kernel, typename... Args>
 #if CK_TILE_USE_LAUNCH_BOUNDS
-__launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
+__launch_bounds__(Kernel::kBlockSize, MinBlockPerCu)
 #endif
     __global__ void kentry(Args... args)
 {
@@ -35,15 +35,11 @@ __launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
 //
 // the "static __device__ operator()(some_arg)" is the entry point of KernelImpl
 //
-template <int MaxThreadPerBlock = CK_TILE_MAX_THREAD_PER_BLOCK,
-          int MinBlockPerCu     = CK_TILE_MIN_BLOCK_PER_CU,
-          typename KernelImpl,
-          typename... Args>
+template <int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU, typename KernelImpl, typename... Args>
 CK_TILE_HOST auto
 make_kernel(KernelImpl /*f*/, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
 {
-    const auto kernel = kentry<MaxThreadPerBlock, MinBlockPerCu, KernelImpl, Args...>;
-
+    const auto kernel = kentry<MinBlockPerCu, KernelImpl, Args...>;
     return [=](const stream_config& s) {
         kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
     };
diff --git a/include/ck_tile/host/reference/reference_grouped_conv_bwd_data.hpp b/include/ck_tile/host/reference/reference_grouped_conv_bwd_data.hpp
new file mode 100644
index 0000000000..c8264800c9
--- /dev/null
+++ b/include/ck_tile/host/reference/reference_grouped_conv_bwd_data.hpp
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+#include <thread>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+
+namespace ck_tile {
+// Host reference for grouped conv backward-data: computes `input` from `output` and `weight`.
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType>
+CK_TILE_HOST void reference_grouped_conv_bwd_data(HostTensor<InDataType>& input,
+                                                  const HostTensor<WeiDataType>& weight,
+                                                  const HostTensor<OutDataType>& output,
+                                                  std::vector<ck_tile::long_index_t> conv_strides,
+                                                  std::vector<ck_tile::long_index_t> conv_dilations,
+                                                  std::vector<ck_tile::long_index_t> in_left_pads,
+                                                  std::vector<ck_tile::long_index_t>) // never read
+{
+    if(!(input.get_num_of_dimension() == NDimSpatial + 3 &&
+         weight.get_num_of_dimension() == NDimSpatial + 3 &&
+         output.get_num_of_dimension() == NDimSpatial + 3))
+    {
+        // NOTE(review): "%lu" assumes these return unsigned long; confirm (size_t needs %zu).
+        printf("%lu %lu %lu",
+               input.get_num_of_dimension(),
+               weight.get_num_of_dimension(),
+               output.get_num_of_dimension());
+
+        throw std::runtime_error("wrong! inconsistent dimension");
+    }
+
+    if constexpr(NDimSpatial == 1)
+    {
+        auto func = [&](auto g, auto n, auto c, auto wi) {
+            std::size_t K = weight.get_lengths()[1];
+            std::size_t X = weight.get_lengths()[3];
+
+            std::size_t Wo = output.get_lengths()[3];
+            float v_acc    = 0;
+            // A tap x contributes only if (wi + pad - x*dilation) is a multiple of the stride
+            for(std::size_t x = 0; x < X; ++x)
+            {
+                auto w_tmp = static_cast<ck_tile::long_index_t>(wi) +
+                             static_cast<ck_tile::long_index_t>(in_left_pads[0]) -
+                             static_cast<ck_tile::long_index_t>(x * conv_dilations[0]);
+
+                if(w_tmp % conv_strides[0] == 0)
+                {
+                    auto wo = static_cast<ck_tile::long_index_t>(w_tmp) /
+                              static_cast<ck_tile::long_index_t>(conv_strides[0]);
+
+                    if(wo >= 0 && ck_tile::type_convert<std::size_t>(wo) < Wo)
+                    {
+                        for(std::size_t k = 0; k < K; ++k)
+                        {
+                            OutDataType v_out = output(g, n, k, wo);
+                            WeiDataType v_wei = weight(g, k, c, x);
+                            v_acc += ck_tile::type_convert<float>(v_out) *
+                                     ck_tile::type_convert<float>(v_wei);
+                        }
+                    }
+                }
+            }
+            InDataType v_acc_converted = ck_tile::type_convert<InDataType>(v_acc);
+            input(g, n, c, wi)         = v_acc_converted;
+        };
+        // presumably calls func for each (g, n, c, wi) of input on hardware_concurrency() threads
+        make_ParallelTensorFunctor(func,
+                                   input.get_lengths()[0],
+                                   input.get_lengths()[1],
+                                   input.get_lengths()[2],
+                                   input.get_lengths()[3])(std::thread::hardware_concurrency());
+    }
+    else if constexpr(NDimSpatial == 2)
+    {
+        auto func = [&](auto g, auto n, auto c, auto hi, auto wi) {
+            std::size_t K = weight.get_lengths()[1];
+            std::size_t Y = weight.get_lengths()[3];
+            std::size_t X = weight.get_lengths()[4];
+
+            std::size_t Ho = output.get_lengths()[3];
+            std::size_t Wo = output.get_lengths()[4];
+
+            float v_acc = 0;
+            // Per-axis test: h uses pads/dilations/strides[0], w uses [1].
+            for(std::size_t y = 0; y < Y; ++y)
+            {
+                auto h_tmp = static_cast<ck_tile::long_index_t>(hi) +
+                             static_cast<ck_tile::long_index_t>(in_left_pads[0]) -
+                             static_cast<ck_tile::long_index_t>(y * conv_dilations[0]);
+                if(h_tmp % conv_strides[0] == 0)
+                {
+                    auto ho = static_cast<ck_tile::long_index_t>(h_tmp) /
+                              static_cast<ck_tile::long_index_t>(conv_strides[0]);
+                    if(ho >= 0 && ck_tile::type_convert<std::size_t>(ho) < Ho)
+                    {
+                        for(std::size_t x = 0; x < X; ++x)
+                        {
+                            auto w_tmp = static_cast<ck_tile::long_index_t>(wi) +
+                                         static_cast<ck_tile::long_index_t>(in_left_pads[1]) -
+                                         static_cast<ck_tile::long_index_t>(x * conv_dilations[1]);
+                            if(w_tmp % conv_strides[1] == 0)
+                            {
+                                auto wo = static_cast<ck_tile::long_index_t>(w_tmp) /
+                                          static_cast<ck_tile::long_index_t>(conv_strides[1]);
+
+                                if(wo >= 0 && ck_tile::type_convert<std::size_t>(wo) < Wo)
+                                {
+                                    for(std::size_t k = 0; k < K; ++k)
+                                    {
+                                        OutDataType v_out = output(g, n, k, ho, wo);
+                                        WeiDataType v_wei = weight(g, k, c, y, x);
+                                        v_acc += ck_tile::type_convert<float>(v_out) *
+                                                 ck_tile::type_convert<float>(v_wei);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            InDataType v_acc_converted = ck_tile::type_convert<InDataType>(v_acc);
+            input(g, n, c, hi, wi)     = v_acc_converted;
+        };
+
+        make_ParallelTensorFunctor(func,
+                                   input.get_lengths()[0],
+                                   input.get_lengths()[1],
+                                   input.get_lengths()[2],
+                                   input.get_lengths()[3],
+                                   input.get_lengths()[4])(std::thread::hardware_concurrency());
+    }
+    else if constexpr(NDimSpatial == 3)
+    {
+        auto func = [&](auto g, auto n, auto c, auto di, auto hi, auto wi) {
+            std::size_t K = weight.get_lengths()[1];
+            std::size_t Z = weight.get_lengths()[3];
+            std::size_t Y = weight.get_lengths()[4];
+            std::size_t X = weight.get_lengths()[5];
+
+            std::size_t Do = output.get_lengths()[3];
+            std::size_t Ho = output.get_lengths()[4];
+            std::size_t Wo = output.get_lengths()[5];
+
+            float v_acc = 0;
+            // Per-axis test: d uses pads/dilations/strides[0], h uses [1], w uses [2].
+            for(std::size_t z = 0; z < Z; ++z)
+            {
+                auto d_tmp = static_cast<ck_tile::long_index_t>(di) +
+                             static_cast<ck_tile::long_index_t>(in_left_pads[0]) -
+                             static_cast<ck_tile::long_index_t>(z * conv_dilations[0]);
+                if(d_tmp % conv_strides[0] == 0)
+                {
+                    auto do_ = static_cast<ck_tile::long_index_t>(d_tmp) /
+                               static_cast<ck_tile::long_index_t>(conv_strides[0]);
+                    if(do_ >= 0 && ck_tile::type_convert<std::size_t>(do_) < Do)
+                    {
+                        for(std::size_t y = 0; y < Y; ++y)
+                        {
+                            auto h_tmp = static_cast<ck_tile::long_index_t>(hi) +
+                                         static_cast<ck_tile::long_index_t>(in_left_pads[1]) -
+                                         static_cast<ck_tile::long_index_t>(y * conv_dilations[1]);
+                            if(h_tmp % conv_strides[1] == 0)
+                            {
+                                auto ho = static_cast<ck_tile::long_index_t>(h_tmp) /
+                                          static_cast<ck_tile::long_index_t>(conv_strides[1]);
+                                if(ho >= 0 && ck_tile::type_convert<std::size_t>(ho) < Ho)
+                                {
+                                    for(std::size_t x = 0; x < X; ++x)
+                                    {
+                                        auto w_tmp =
+                                            static_cast<ck_tile::long_index_t>(wi) +
+                                            static_cast<ck_tile::long_index_t>(in_left_pads[2]) -
+                                            static_cast<ck_tile::long_index_t>(x *
+                                                                               conv_dilations[2]);
+
+                                        if(w_tmp % conv_strides[2] == 0)
+                                        {
+                                            auto wo =
+                                                static_cast<ck_tile::long_index_t>(w_tmp) /
+                                                static_cast<ck_tile::long_index_t>(conv_strides[2]);
+                                            if(wo >= 0 &&
+                                               ck_tile::type_convert<std::size_t>(wo) < Wo)
+                                            {
+                                                for(std::size_t k = 0; k < K; ++k)
+                                                {
+                                                    OutDataType v_out =
+                                                        output(g, n, k, do_, ho, wo);
+                                                    WeiDataType v_wei = weight(g, k, c, z, y, x);
+                                                    v_acc += ck_tile::type_convert<float>(v_out) *
+                                                             ck_tile::type_convert<float>(v_wei);
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            InDataType v_acc_converted = ck_tile::type_convert<InDataType>(v_acc);
+            input(g, n, c, di, hi, wi) = v_acc_converted;
+        };
+
+        make_ParallelTensorFunctor(func,
+                                   input.get_lengths()[0],
+                                   input.get_lengths()[1],
+                                   input.get_lengths()[2],
+                                   input.get_lengths()[3],
+                                   input.get_lengths()[4],
+                                   input.get_lengths()[5])(std::thread::hardware_concurrency());
+    }
+    else
+    {
+        throw std::runtime_error(
+            "Ref_conv_bwd_data: number of dimensions must be between 1 and 3.");
+    }
+}
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
index f06910db3d..c7717f08cd 100644
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/kernel/add_rmsnorm2d_rdquant_fwd_kernel.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -53,6 +53,7 @@ struct AddRmsnorm2dRdquantFwd
     static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
     static constexpr index_t Vector_N        = Problem::BlockShape::Vector_N;
     static constexpr index_t Repeat_N        = Problem::BlockShape::Repeat_N;
+    static constexpr index_t kBlockSize      = Problem::BlockShape::BlockSize;
 
     static constexpr auto I0 = number<0>{};
     static constexpr auto I1 = number<1>{};
diff --git a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
index a4150e8d84..b0f48f6c5b 100644
--- a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
+++ b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
@@ -34,6 +34,8 @@ struct BatchedTransposeKernel
 
     using Type = typename Problem::DataType;
 
+    static constexpr index_t kBlockSize = Problem::kBlockSize;
+
     struct BatchedTransposeKargs
     {
         const void* p_input;
diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp
index 45803ae2da..b791bf9727 100644
--- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp
+++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -20,11 +20,10 @@ struct BatchedTransposeLdsProblem
 
     static constexpr index_t kRowWarps_    = NumWarps::at(number<0>{});
     static constexpr index_t kColWarps_    = NumWarps::at(number<1>{});
-    static constexpr index_t kBlockSize_   = get_warp_size() * kRowWarps_ * kColWarps_;
     static constexpr index_t kRowPerBlock_ = BlockTile::at(number<0>{});
     static constexpr index_t kColPerBlock_ = BlockTile::at(number<1>{});
 
-    static constexpr index_t kBlockSize = kBlockSize_;
+    static constexpr index_t kBlockSize = get_warp_size() * kRowWarps_ * kColWarps_;
     // warps per block
     static constexpr index_t kLeadNumWarps   = kColWarps_;
     static constexpr index_t kSecondNumWarps = kRowWarps_;
diff --git a/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp b/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp
index 103468c5fa..2ec9414f42 100644
--- a/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp
+++ b/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp
@@ -20,6 +20,8 @@ struct ElementWiseKernel
     using YDataType            = ck_tile::remove_cvref_t<typename Problem::YDataType>;
     using ElementWiseOperation = ck_tile::remove_cvref_t<typename Problem::ElementWiseOperation>;
 
+    static constexpr index_t kBlockSize = Problem::BlockShape::kBlockSize;
+
     template <typename... XDataType, typename Dims>
     CK_TILE_DEVICE void operator()(Dims lens,
                                    Dims input_strides,
diff --git a/include/ck_tile/ops/elementwise/pipeline/elementwise_shape.hpp b/include/ck_tile/ops/elementwise/pipeline/elementwise_shape.hpp
index 0d25a8a202..aaad6407d4 100644
--- a/include/ck_tile/ops/elementwise/pipeline/elementwise_shape.hpp
+++ b/include/ck_tile/ops/elementwise/pipeline/elementwise_shape.hpp
@@ -14,13 +14,14 @@ struct ElementWiseShape
 
     static constexpr index_t kWarpM = WarpTile::at(number<0>{});
 
-    static constexpr index_t kVectorM = 16 / sizeof(ComputeDataType);
+    static constexpr index_t kVectorM =
+        min(static_cast<index_t>(16 / sizeof(ComputeDataType)), kWarpM / get_warp_size());
 
     static constexpr index_t kWarpPerBlockM = BlockWarps::at(number<0>{});
 
-    static constexpr index_t kThreadPerWarpM = kWarpM / kVectorM;
+    static constexpr index_t kThreadPerWarpM = get_warp_size();
 
-    static constexpr index_t kRepeatM = kBlockM / (kWarpPerBlockM * kWarpM);
+    static constexpr index_t kRepeatM = kBlockM / (kWarpPerBlockM * kVectorM * kThreadPerWarpM);
 
     static constexpr index_t kBlockSize =
         ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
diff --git a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
index 0e385901ed..2f8cef7afd 100644
--- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
+++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
@@ -262,219 +262,67 @@ struct PassThroughPack2
 
 struct PassThrough
 {
-    template <typename Y, typename X>
-    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const;
+    template <class T>
+    using raw_t = std::remove_cv_t<std::remove_reference_t<T>>;
 
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<double, double>(double& y, const double& x) const
+    template <class Y, class X>
+    CK_TILE_HOST_DEVICE void operator()(Y&& y, const X& x) const
     {
-        y = x;
+        /*  Only do the assignment when
+            - y is an *l-value*   and
+            - y is *not* const     */
+        if constexpr(std::is_lvalue_reference_v<Y&&> && !std::is_const_v<raw_t<Y>>)
+        {
+            y = ck_tile::type_convert<raw_t<Y>>(x);
+        }
+        /*  otherwise (r-value or const)     → do nothing  */
     }
 
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<float, double>(float& y, const double& x) const
+    template <typename E, typename C, typename... Ds>
+    CK_TILE_HOST_DEVICE auto operator()(E& e, const C& c, const Ds&... ds) const -> void
     {
-        y = type_convert<float>(x);
-    }
+        // Suppress unused parameter warning for ds
+        ((void)ds, ...);
 
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<double, float>(double& y, const float& x) const
-    {
-        y = type_convert<double>(x);
+        // Assign c to e, converting to E when the types differ
+        if constexpr(std::is_same_v<E, C>)
+        {
+            e = c;
+        }
+        else
+        {
+            e = ck_tile::type_convert<E>(c);
+        }
     }
+};
 
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<float, float>(float& y, const float& x) const
+struct MultiDMultiply
+{
+    template <typename E, typename C, typename... Ds>
+    CK_TILE_HOST_DEVICE auto operator()(E& e, const C& c, const Ds&... ds) const -> void
     {
-        y = x;
-    }
+        // Start with the base value c
+        float result = ck_tile::type_convert<float>(c);
 
-    template <>
-    CK_TILE_HOST_DEVICE void
-    operator()<ck_tile::fp16_t, ck_tile::fp16_t>(ck_tile::fp16_t& y, const ck_tile::fp16_t& x) const
-    {
-        y = x;
-    }
+        // Multiply by each D parameter using fold expression
+        ((result *= ck_tile::type_convert<float>(ds)), ...);
 
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<ck_tile::fp16_t, float>(ck_tile::fp16_t& y,
-                                                                const float& x) const
-    {
-        y = type_convert<ck_tile::fp16_t>(x);
+        e = ck_tile::type_convert<E>(result);
     }
+};
 
-    template <>
-    CK_TILE_HOST_DEVICE void
-    operator()<ck_tile::bf16_t, ck_tile::bf16_t>(ck_tile::bf16_t& y, const ck_tile::bf16_t& x) const
+struct MultiDAdd
+{
+    template <typename E, typename C, typename... Ds>
+    CK_TILE_HOST_DEVICE auto operator()(E& e, const C& c, const Ds&... ds) const -> void
     {
-        y = x;
-    }
+        // Start with the base value c
+        float result = ck_tile::type_convert<float>(c);
 
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<int32_t, int32_t>(int32_t& y, const int32_t& x) const
-    {
-        y = x;
-    }
+        // Add each D parameter using a fold expression
+        ((result += ck_tile::type_convert<float>(ds)), ...);
 
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<ck_tile::bf16_t, float>(ck_tile::bf16_t& y,
-                                                                const float& x) const
-    {
-        y = type_convert<ck_tile::bf16_t>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<float, ck_tile::bf16_t>(float& y,
-                                                                const ck_tile::bf16_t& x) const
-    {
-        y = type_convert<float>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void
-    operator()<ck_tile::bf16_t, ck_tile::fp16_t>(ck_tile::bf16_t& y, const ck_tile::fp16_t& x) const
-    {
-        y = type_convert<ck_tile::bf16_t>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<float, ck_tile::fp16_t>(float& y,
-                                                                const ck_tile::fp16_t& x) const
-    {
-        y = type_convert<float>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const
-    {
-        y = x;
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<ck_tile::fp16_t, int8_t>(ck_tile::fp16_t& y,
-                                                                 const int8_t& x) const
-    {
-        y = type_convert<ck_tile::fp16_t>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<ck_tile::bf16_t, int8_t>(ck_tile::bf16_t& y,
-                                                                 const int8_t& x) const
-    {
-        y = type_convert<ck_tile::bf16_t>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<uint8_t, uint8_t>(uint8_t& y, const uint8_t& x) const
-    {
-        y = x;
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<int8_t, int32_t>(int8_t& y, const int32_t& x) const
-    {
-        y = type_convert<int8_t>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<int32_t, int8_t>(int32_t& y, const int8_t& x) const
-    {
-        y = type_convert<int32_t>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<int8_t, float>(int8_t& y, const float& x) const
-    {
-        y = type_convert<int8_t>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<float, int8_t>(float& y, const int8_t& x) const
-    {
-        y = type_convert<float>(x);
-    }
-
-#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<int4_t, int4_t>(int4_t& y, const int4_t& x) const
-    {
-        y = x;
-    }
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<int4_t, int>(int4_t& y, const int& x) const
-    {
-        y = type_convert<int4_t>(x);
-    }
-#endif
-
-    template <>
-    CK_TILE_HOST_DEVICE void
-    operator()<ck_tile::fp8_t, ck_tile::fp8_t>(ck_tile::fp8_t& y, const ck_tile::fp8_t& x) const
-    {
-        y = x;
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<float, ck_tile::fp8_t>(float& y,
-                                                               const ck_tile::fp8_t& x) const
-    {
-        y = type_convert<float>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<ck_tile::fp8_t, float>(ck_tile::fp8_t& y,
-                                                               const float& x) const
-    {
-        y = type_convert<ck_tile::fp8_t>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void
-    operator()<ck_tile::fp16_t, ck_tile::fp8_t>(ck_tile::fp16_t& y, const ck_tile::fp8_t& x) const
-    {
-        y = type_convert<ck_tile::fp16_t>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void
-    operator()<ck_tile::fp8_t, ck_tile::fp16_t>(ck_tile::fp8_t& y, const ck_tile::fp16_t& x) const
-    {
-        y = type_convert<ck_tile::fp8_t>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void
-    operator()<ck_tile::bf8_t, ck_tile::bf8_t>(ck_tile::bf8_t& y, const ck_tile::bf8_t& x) const
-    {
-        y = x;
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<float, ck_tile::bf8_t>(float& y,
-                                                               const ck_tile::bf8_t& x) const
-    {
-        y = type_convert<float>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void operator()<ck_tile::bf8_t, float>(ck_tile::bf8_t& y,
-                                                               const float& x) const
-    {
-        y = type_convert<ck_tile::bf8_t>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void
-    operator()<ck_tile::fp16_t, ck_tile::bf8_t>(ck_tile::fp16_t& y, const ck_tile::bf8_t& x) const
-    {
-        y = type_convert<ck_tile::fp16_t>(x);
-    }
-
-    template <>
-    CK_TILE_HOST_DEVICE void
-    operator()<ck_tile::bf8_t, ck_tile::fp16_t>(ck_tile::bf8_t& y, const ck_tile::fp16_t& x) const
-    {
-        y = ck_tile::type_convert<ck_tile::bf8_t>(x);
+        e = ck_tile::type_convert<E>(result);
     }
 };
 
diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
index d42f144baa..1d0a4c42f4 100644
--- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
@@ -17,7 +17,6 @@ template <typename ADataType_,
           typename DsLayout_,
           typename ELayout_,
           typename CDElementwise_,
-          index_t kBlockSize_,
           index_t kM_,
           index_t kN_,
           index_t MWave_,
@@ -40,7 +39,7 @@ struct CShuffleEpilogueProblem
     using DsLayout                                         = remove_cvref_t<DsLayout_>;
     using ELayout                                          = remove_cvref_t<ELayout_>;
     using CDElementwise                                    = remove_cvref_t<CDElementwise_>;
-    static constexpr index_t kBlockSize                    = kBlockSize_;
+    static constexpr index_t kBlockSize                    = MWave_ * NWave_ * get_warp_size();
     static constexpr index_t kMPerBlock                    = kM_;
     static constexpr index_t kNPerBlock                    = kN_;
     static constexpr index_t MWave                         = MWave_;
@@ -203,13 +202,13 @@ struct CShuffleEpilogue
     static constexpr index_t MPerIterationShuffle = std::get<0>(MNPerIterationShuffle);
     static constexpr index_t NPerIterationShuffle = std::get<1>(MNPerIterationShuffle);
 
-    using WG = WarpGemmMfmaDispatcher<ATypeToUse,
-                                      BTypeToUse,
-                                      AccDataType,
-                                      MPerXdl,
-                                      NPerXdl,
-                                      KPerXdl,
-                                      isCTransposed>;
+    using WG = WarpGemmDispatcher<ATypeToUse,
+                                  BTypeToUse,
+                                  AccDataType,
+                                  MPerXdl,
+                                  NPerXdl,
+                                  KPerXdl,
+                                  isCTransposed>;
 
     using CWarpDstr   = typename WG::CWarpDstr;
     using CWarpTensor = typename WG::CWarpTensor;
diff --git a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
index fdbe2e7a6d..8a0970f494 100644
--- a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
@@ -130,13 +130,13 @@ struct DefaultGemm2DEpilogue : public Default2DEpilogue<Problem_, Policy_>
     static constexpr index_t kKPerXdl      = Problem::kKPerXdl;
     static constexpr index_t isCTransposed = Problem::isCTransposed;
 
-    using WG = WarpGemmMfmaDispatcher<ADataType,
-                                      BTypeToUse,
-                                      AccDataType,
-                                      kMPerXdl,
-                                      kNPerXdl,
-                                      kKPerXdl,
-                                      isCTransposed>;
+    using WG = WarpGemmDispatcher<ADataType,
+                                  BTypeToUse,
+                                  AccDataType,
+                                  kMPerXdl,
+                                  kNPerXdl,
+                                  kKPerXdl,
+                                  isCTransposed>;
 
     using CWarpDstr = typename WG::CWarpDstr;
 
diff --git a/include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp b/include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp
index 23c4ad583e..21ca470222 100644
--- a/include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp
+++ b/include/ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp
@@ -63,48 +63,15 @@ struct Flatmm_32x512x128_1x4x1_16x16x32_Base // for f16/bf16
     static constexpr index_t Repeat_N = Block_N / (Warp_N * WarpPerBlock_N); // 8
     static constexpr index_t Repeat_K = Block_K / (Warp_K * WarpPerBlock_K); // 8/2=4
 
-    static CK_TILE_DEVICE constexpr auto MakeCBlockDist()
+    private:
+    template <index_t LanesPerK, index_t WarpSize, typename = void>
+    struct LdsStoreDescSelector;
+
+    template <index_t LanesPerK, index_t WarpSize>
+    struct LdsStoreDescSelector<LanesPerK, WarpSize, std::enable_if_t<(LanesPerK >= WarpSize)>>
     {
-        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
-            sequence<>,
-            tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_N, WarpPerBlock_N>>,
-            tuple<sequence<1, 2>>,
-            tuple<sequence<1, 1>>,
-            sequence<2, 1>, // !! note here is different
-            sequence<0, 0>>{};
-
-        using WG = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution<>;
-
-        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
-        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
-        return c_block_dstr;
-    }
-
-    static CK_TILE_DEVICE constexpr auto MakeCBlockTile()
-    {
-        using CDataType             = float;
-        constexpr auto c_block_dstr = MakeCBlockDist();
-        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
-        return c_block_tensor;
-    }
-
-    CK_TILE_HOST_DEVICE static constexpr auto MakeLdsStoreDesc_A()
-    {
-        // A async->LDS
-        // constexpr index_t Block_M = Problem::BlockShape::Block_M0;
-        // constexpr index_t Block_K = Problem::BlockShape::Block_K0;
-        // constexpr index_t BlockSize = Problem::BlockShape::BlockSize;
-        constexpr index_t WarpSize = ck_tile::get_warp_size();
-        // constexpr index_t NumWarps = Problem::BlockShape::NumWarps;
-
-        constexpr index_t KPack_  = 8;      // GetSmemKPack_A<Problem>(); // LDS
-        constexpr index_t KVector = 2;      // GetAlignment_A<Problem>(); // async copy 1 dword
-        constexpr index_t KPad    = KPack_; // pad between warps
-
-        static_assert(Block_K % KVector == 0);
-        constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K
-        if constexpr(LanesPerK >= WarpSize)
+        template <index_t NumWarps, index_t Block_M, index_t Block_K, index_t KVector, index_t KPad>
+        static CK_TILE_HOST_DEVICE constexpr auto MakeDesc()
         {
             // need multiple waves to load K
             static_assert(LanesPerK % WarpSize == 0);
@@ -143,7 +110,13 @@ struct Flatmm_32x512x128_1x4x1_16x16x32_Base // for f16/bf16
                 return lds_block_desc_issues_warps_lanes;
             }
         }
-        else
+    };
+
+    template <index_t LanesPerK, index_t WarpSize>
+    struct LdsStoreDescSelector<LanesPerK, WarpSize, std::enable_if_t<(LanesPerK < WarpSize)>>
+    {
+        template <index_t NumWarps, index_t Block_M, index_t Block_K, index_t KVector, index_t KPad>
+        static CK_TILE_HOST_DEVICE constexpr auto MakeDesc()
         {
             // lanes within a wave load different M but same K
             static_assert(WarpSize % LanesPerK == 0);
@@ -175,6 +148,49 @@ struct Flatmm_32x512x128_1x4x1_16x16x32_Base // for f16/bf16
 
             return lds_block_desc_issues_warps_lanes;
         }
+    };
+
+    public:
+    static CK_TILE_DEVICE constexpr auto MakeCBlockDist()
+    {
+        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+            sequence<>,
+            tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_N, WarpPerBlock_N>>,
+            tuple<sequence<1, 2>>,
+            tuple<sequence<1, 1>>,
+            sequence<2, 1>, // !! note here is different
+            sequence<0, 0>>{};
+
+        using WG = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution<>;
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+        return c_block_dstr;
+    }
+
+    static CK_TILE_DEVICE constexpr auto MakeCBlockTile()
+    {
+        using CDataType             = float;
+        constexpr auto c_block_dstr = MakeCBlockDist();
+        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
+        return c_block_tensor;
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto MakeLdsStoreDesc_A()
+    {
+        // A async->LDS
+        constexpr index_t WarpSize = ck_tile::get_warp_size();
+
+        constexpr index_t KPack_  = 8;      // GetSmemKPack_A<Problem>(); // LDS
+        constexpr index_t KVector = 2;      // GetAlignment_A<Problem>(); // async copy 1 dword
+        constexpr index_t KPad    = KPack_; // pad between warps
+
+        static_assert(Block_K % KVector == 0);
+        constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K
+
+        return LdsStoreDescSelector<LanesPerK, WarpSize>::
+            template MakeDesc<NumWarps, Block_M, Block_K, KVector, KPad>();
     }
 
     // template <typename Problem>
diff --git a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
index 76df056ea6..20ca976590 100644
--- a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
+++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
@@ -91,13 +91,13 @@ struct FlatmmKernel
     using FlatmmPipeline  = remove_cvref_t<FlatmmPipeline_>;
     using BlockGemmShape =
         remove_cvref_t<typename FlatmmPipeline::BlockGemmShape>; // TileFlatmmShape
-    using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
-    using ALayout          = remove_cvref_t<typename FlatmmPipeline::ALayout>;
-    using BLayout          = remove_cvref_t<typename FlatmmPipeline::BLayout>;
-    using ELayout          = remove_cvref_t<typename FlatmmPipeline::CLayout>;
-    using DsLayout         = remove_cvref_t<typename EpiloguePipeline::DsLayout>;
-    using DsDataType       = remove_cvref_t<typename EpiloguePipeline::DsDataType>;
-    static constexpr index_t KernelBlockSize = FlatmmPipeline::BlockSize;
+    using EpiloguePipeline              = remove_cvref_t<EpiloguePipeline_>;
+    using ALayout                       = remove_cvref_t<typename FlatmmPipeline::ALayout>;
+    using BLayout                       = remove_cvref_t<typename FlatmmPipeline::BLayout>;
+    using ELayout                       = remove_cvref_t<typename FlatmmPipeline::CLayout>;
+    using DsLayout                      = remove_cvref_t<typename EpiloguePipeline::DsLayout>;
+    using DsDataType                    = remove_cvref_t<typename EpiloguePipeline::DsDataType>;
+    static constexpr index_t kBlockSize = FlatmmPipeline::BlockSize;
 
     using ADataType = remove_cvref_t<typename FlatmmPipeline::ADataType>;
     using BDataType = remove_cvref_t<typename FlatmmPipeline::BDataType>;
@@ -127,7 +127,7 @@ struct FlatmmKernel
         return dim3(TilePartitioner::GridSize(M, N), 1, KBatch);
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
+    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
 
     CK_TILE_HOST static constexpr KernelArgs
     MakeKernelArgs(const FlatmmHostArgs<NumDTensor>& hostArgs)
diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
index cc00000efc..3ca79fc46e 100644
--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
@@ -237,15 +237,16 @@ struct UniversalFlatmmPipelineAgBgCrPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetKBPerLoad()
     {
-        using TileShape = typename Problem::BlockGemmShape;
+        using TileShape         = typename Problem::BlockGemmShape;
+        constexpr index_t scale = get_warp_size() == 32 ? 2 : 1;
         if constexpr(TileShape::WarpTile::at(I1) == 32)
         {
-            return TileShape::WarpTile::at(I2) / 2;
+            return TileShape::WarpTile::at(I2) * scale / 2;
         }
         else
         {
             static_assert(TileShape::WarpTile::at(I1) == 16);
-            return TileShape::WarpTile::at(I2) / 4;
+            return TileShape::WarpTile::at(I2) * scale / 4;
         }
     }
 
@@ -430,13 +431,13 @@ struct UniversalFlatmmPipelineAgBgCrPolicy
         // using AccDataType = float;
         using BlockWarps = typename Problem::BlockGemmShape::BlockWarps;
         using WarpTile   = typename Problem::BlockGemmShape::WarpTile;
-        using WarpGemm   = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                  typename Problem::BDataType,
-                                                  typename Problem::CDataType,
-                                                  WarpTile::at(I0),
-                                                  WarpTile::at(I1),
-                                                  WarpTile::at(I2),
-                                                  Problem::TransposeC>;
+        using WarpGemm   = WarpGemmDispatcher<typename Problem::ADataType,
+                                              typename Problem::BDataType,
+                                              typename Problem::CDataType,
+                                              WarpTile::at(I0),
+                                              WarpTile::at(I1),
+                                              WarpTile::at(I2),
+                                              Problem::TransposeC>;
 
         using BlockFlatmmPolicy = BlockFlatmmASmemBSmemCRegV1CustomPolicy<
             typename Problem::ADataType,
diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp
index d8dd5db12e..16fde15c7b 100644
--- a/include/ck_tile/ops/fmha.hpp
+++ b/include/ck_tile/ops/fmha.hpp
@@ -45,6 +45,8 @@
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_default_policy.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload_policy.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp"
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp
index 0d0959ba27..2850ce3379 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -24,9 +24,10 @@ namespace ck_tile {
 template <typename FmhaPipeline_, typename EpiloguePipeline_>
 struct FmhaBatchPrefillWithPagedKVCacheKernel
 {
-    using FmhaPipeline                            = ck_tile::remove_cvref_t<FmhaPipeline_>;
-    using EpiloguePipeline                        = ck_tile::remove_cvref_t<EpiloguePipeline_>;
-    static constexpr ck_tile::index_t kBlockSize  = FmhaPipeline::kBlockSize;
+    using FmhaPipeline                           = ck_tile::remove_cvref_t<FmhaPipeline_>;
+    using EpiloguePipeline                       = ck_tile::remove_cvref_t<EpiloguePipeline_>;
+    static constexpr ck_tile::index_t kBlockSize = FmhaPipeline::kBlockSize;
+
     static constexpr ck_tile::index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
     static_assert(kBlockPerCu > 0);
     static constexpr ck_tile::index_t kBlockPerCuInput = FmhaPipeline::Problem::kBlockPerCu;
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp
index 9fec9a320c..66f51459af 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -16,6 +16,7 @@ struct FmhaFwdAppendKVKernel
     using FmhaPipeline                            = ck_tile::remove_cvref_t<FmhaPipeline_>;
     static constexpr ck_tile::index_t kBlockSize  = FmhaPipeline::kBlockSize;
     static constexpr ck_tile::index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
+
     static_assert(kBlockPerCu > 0);
     static constexpr ck_tile::index_t kBlockPerCuInput = FmhaPipeline::Problem::kBlockPerCu;
 
@@ -647,44 +648,29 @@ struct FmhaFwdAppendKVKernel
                              make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kN0>{}),
                              {0, i_n0});
 
-        if constexpr(kApplyRoPE)
-        {
-            FmhaPipeline{}(q_dram_window,
-                           k_dram_window,
-                           i_page_block_k,
-                           k_page_block_navigator,
-                           knew_dram_window,
-                           v_dram_window,
-                           i_page_block_v,
-                           v_page_block_navigator,
-                           vnew_dram_window,
-                           q_rotary_cos_dram_window,
-                           q_rotary_sin_dram_window,
-                           knew_rotary_cos_dram_window,
-                           knew_rotary_sin_dram_window,
-                           kargs.rotary_dim,
-                           kargs.seqlen_q <= i_m0,
-                           skip_append_kv);
-        }
-        else
-        {
-            FmhaPipeline{}(q_dram_window,
-                           k_dram_window,
-                           i_page_block_k,
-                           k_page_block_navigator,
-                           knew_dram_window,
-                           v_dram_window,
-                           i_page_block_v,
-                           v_page_block_navigator,
-                           vnew_dram_window,
-                           q_rotary_cos_dram_window,
-                           q_rotary_sin_dram_window,
-                           knew_rotary_cos_dram_window,
-                           knew_rotary_sin_dram_window,
-                           0, // rotary_dim not used
-                           kargs.seqlen_q <= i_m0,
-                           skip_append_kv);
-        }
+        // If kApplyRoPE is false, rotary_dim is not used, so pass 0 instead
+        auto rotary_dim = [&]() {
+            if constexpr(kApplyRoPE)
+                return kargs.rotary_dim;
+            else
+                return 0;
+        }();
+        FmhaPipeline{}(q_dram_window,
+                       k_dram_window,
+                       i_page_block_k,
+                       k_page_block_navigator,
+                       knew_dram_window,
+                       v_dram_window,
+                       i_page_block_v,
+                       v_page_block_navigator,
+                       vnew_dram_window,
+                       q_rotary_cos_dram_window,
+                       q_rotary_sin_dram_window,
+                       knew_rotary_cos_dram_window,
+                       knew_rotary_sin_dram_window,
+                       rotary_dim,
+                       kargs.seqlen_q <= i_m0,
+                       skip_append_kv);
     }
 };
 
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
index 8d257a3329..6d35afaa26 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -13,6 +13,7 @@
 #include <utility>
 #include <variant>
 
+#define CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD 0
 // S[seqlen_q, seqlen_k] = Q[seqlen_q, hdim_q] @ K[seqlen_k, hdim_q]
 // S'[seqlen_q, seqlen_k] = S[seqlen_q, seqlen_k] * Scale[1]
 // S''[seqlen_q, seqlen_k] = S'[seqlen_q, seqlen_k] + Bias[seqlen_q, seqlen_k]
@@ -24,9 +25,10 @@ namespace ck_tile {
 template <typename FmhaPipeline_, typename EpiloguePipeline_>
 struct FmhaFwdKernel
 {
-    using FmhaPipeline                            = ck_tile::remove_cvref_t<FmhaPipeline_>;
-    using EpiloguePipeline                        = ck_tile::remove_cvref_t<EpiloguePipeline_>;
-    static constexpr ck_tile::index_t kBlockSize  = FmhaPipeline::kBlockSize;
+    using FmhaPipeline                           = ck_tile::remove_cvref_t<FmhaPipeline_>;
+    using EpiloguePipeline                       = ck_tile::remove_cvref_t<EpiloguePipeline_>;
+    static constexpr ck_tile::index_t kBlockSize = FmhaPipeline::kBlockSize;
+
     static constexpr ck_tile::index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
     static_assert(kBlockPerCu > 0);
     static constexpr ck_tile::index_t kBlockPerCuInput = FmhaPipeline::Problem::kBlockPerCu;
@@ -61,6 +63,14 @@ struct FmhaFwdKernel
 
     static constexpr bool kUseAsyncCopy = FmhaPipeline::Policy::AsyncCopy;
 
+    static constexpr bool kUseTrLoad = FmhaPipeline::Problem::kUseTrLoad;
+#if defined(__gfx950__)
+    static constexpr bool kIsAvialable = true;
+#else
+    static constexpr bool kIsAvialable = !kUseTrLoad;
+#endif
+    static constexpr std::string_view kPipelineName = FmhaPipeline::name;
+
     // clang-format off
     template <typename T> struct t2s;
     template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
@@ -100,7 +110,7 @@ struct FmhaFwdKernel
             (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" +
             "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "_npad" : "_" + pn) +
             (kHasLogitsSoftCap ? "_logits" : "_nlogits" ) + (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
-            (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" ) + (kHasDropout ? "_dropout" : "_ndropout" ) + (kSkipMinSeqlenQ ? "_skip" : "_nskip" ) + (kDoFp8StaticQuant ? "_squant" : "_nsquant" );
+            (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" ) + (kHasDropout ? "_dropout" : "_ndropout" ) + (kSkipMinSeqlenQ ? "_skip" : "_nskip" ) + (kDoFp8StaticQuant ? "_squant" : "_nsquant" ) + (kUseTrLoad ? "_trload" : "_ntrload");
         #undef _SS_
         #undef _TS_
         // clang-format on
@@ -1036,455 +1046,1142 @@ struct FmhaFwdKernel
 
     CK_TILE_DEVICE void operator()(Kargs kargs) const
     {
-        // allocate LDS
-        __shared__ char smem_ptr[GetSmemSize()];
+        if constexpr(kIsAvialable) // compile-time gate: false only when kUseTrLoad is set and __gfx950__ is undefined
+            run_(std::move(kargs)); // real kernel body; not instantiated at all when the gate is false
+    }
 
-        // divide problem
-        const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
-
-        const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
-        const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
-
-        long_index_t batch_offset_q       = 0;
-        long_index_t batch_offset_k       = 0;
-        long_index_t batch_offset_v       = 0;
-        long_index_t batch_offset_bias    = 0;
-        long_index_t batch_offset_randval = 0;
-        long_index_t batch_offset_lse     = 0;
-        long_index_t batch_offset_o       = 0;
-
-        if constexpr(kIsGroupMode)
+    CK_TILE_DEVICE void run_(Kargs kargs) const
+    {
+        if constexpr(kPipelineName != "qr_async_trload")
         {
-            // get starting offset for each batch
-            const long_index_t query_start = kargs.seqstart_q_ptr[i_batch];
-            const long_index_t key_start   = kargs.seqstart_k_ptr[i_batch];
+            // allocate LDS
+            __shared__ char smem_ptr[GetSmemSize()];
 
-            batch_offset_q = query_start * kargs.stride_q;
-            batch_offset_k = key_start * kargs.stride_k;
-            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
-            {
-                batch_offset_v = key_start * kargs.stride_v;
-            }
-            else
-            {
-                batch_offset_v = key_start;
-            }
-            if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
-            {
-                batch_offset_bias = query_start * kargs.stride_bias;
-            }
-            if constexpr(kStoreLSE)
-            {
-                batch_offset_lse = query_start;
-            }
-            if constexpr(kHasDropout)
-            {
-                batch_offset_randval = query_start * kargs.stride_randval;
-            }
-            batch_offset_o = query_start * kargs.stride_o;
+            // divide problem
+            const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
 
-            // get real # queries & # keys under group mode
-            const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
-            kargs.seqlen_q = adjusted_seqstart_q_ptr[1] - adjusted_seqstart_q_ptr[0];
+            const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
+            const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
 
-            if constexpr(kSkipMinSeqlenQ)
+            long_index_t batch_offset_q       = 0;
+            long_index_t batch_offset_k       = 0;
+            long_index_t batch_offset_v       = 0;
+            long_index_t batch_offset_bias    = 0;
+            long_index_t batch_offset_randval = 0;
+            long_index_t batch_offset_lse     = 0;
+            long_index_t batch_offset_o       = 0;
+
+            if constexpr(kIsGroupMode)
             {
-                if(kargs.seqlen_q <= kargs.min_seqlen_q)
+                // get starting offset for each batch
+                const long_index_t query_start = kargs.seqstart_q_ptr[i_batch];
+                const long_index_t key_start   = kargs.seqstart_k_ptr[i_batch];
+
+                batch_offset_q = query_start * kargs.stride_q;
+                batch_offset_k = key_start * kargs.stride_k;
+                if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+                {
+                    batch_offset_v = key_start * kargs.stride_v;
+                }
+                else
+                {
+                    batch_offset_v = key_start;
+                }
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
+                {
+                    batch_offset_bias = query_start * kargs.stride_bias;
+                }
+                if constexpr(kStoreLSE)
+                {
+                    batch_offset_lse = query_start;
+                }
+                if constexpr(kHasDropout)
+                {
+                    batch_offset_randval = query_start * kargs.stride_randval;
+                }
+                batch_offset_o = query_start * kargs.stride_o;
+
+                // get real # queries & # keys under group mode
+                const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
+                kargs.seqlen_q = adjusted_seqstart_q_ptr[1] - adjusted_seqstart_q_ptr[0];
+
+                if constexpr(kSkipMinSeqlenQ)
+                {
+                    if(kargs.seqlen_q <= kargs.min_seqlen_q)
+                    {
+                        return;
+                    }
+                }
+
+                // the # of required blocks differs between groups; terminate unnecessary blocks
+                // early
+                if(kargs.seqlen_q <= i_m0)
                 {
                     return;
                 }
-            }
 
-            // # of required blocks is different in each groups, terminate unnecessary blocks
-            // earlier
-            if(kargs.seqlen_q <= i_m0)
-            {
-                return;
-            }
-
-            if(kargs.seqlen_k_ptr != nullptr)
-            {
-                kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];
+                if(kargs.seqlen_k_ptr != nullptr)
+                {
+                    kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];
+                }
+                else
+                {
+                    const auto adjusted_seqstart_k_ptr = kargs.seqstart_k_ptr + i_batch;
+                    kargs.seqlen_k = adjusted_seqstart_k_ptr[1] - adjusted_seqstart_k_ptr[0];
+                }
             }
             else
             {
-                const auto adjusted_seqstart_k_ptr = kargs.seqstart_k_ptr + i_batch;
-                kargs.seqlen_k = adjusted_seqstart_k_ptr[1] - adjusted_seqstart_k_ptr[0];
+                batch_offset_q = static_cast<long_index_t>(i_batch) * kargs.batch_stride_q;
+                batch_offset_k = static_cast<long_index_t>(i_batch) * kargs.batch_stride_k;
+                batch_offset_v = static_cast<long_index_t>(i_batch) * kargs.batch_stride_v;
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
+                {
+                    batch_offset_bias =
+                        static_cast<long_index_t>(i_batch) * kargs.batch_stride_bias;
+                }
+                if constexpr(kStoreLSE)
+                {
+                    batch_offset_lse = static_cast<long_index_t>(i_batch) * kargs.batch_stride_lse;
+                }
+                if constexpr(kHasDropout)
+                {
+                    batch_offset_randval =
+                        static_cast<long_index_t>(i_batch) * kargs.batch_stride_randval;
+                }
+                batch_offset_o = static_cast<long_index_t>(i_batch) * kargs.batch_stride_o;
             }
-        }
-        else
-        {
-            batch_offset_q = static_cast<long_index_t>(i_batch) * kargs.batch_stride_q;
-            batch_offset_k = static_cast<long_index_t>(i_batch) * kargs.batch_stride_k;
-            batch_offset_v = static_cast<long_index_t>(i_batch) * kargs.batch_stride_v;
-            if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
-            {
-                batch_offset_bias = static_cast<long_index_t>(i_batch) * kargs.batch_stride_bias;
-            }
-            if constexpr(kStoreLSE)
-            {
-                batch_offset_lse = static_cast<long_index_t>(i_batch) * kargs.batch_stride_lse;
-            }
-            if constexpr(kHasDropout)
-            {
-                batch_offset_randval =
-                    static_cast<long_index_t>(i_batch) * kargs.batch_stride_randval;
-            }
-            batch_offset_o = static_cast<long_index_t>(i_batch) * kargs.batch_stride_o;
-        }
 
-        // for simplicity, batch stride we just modify the pointer
-        const QDataType* q_ptr = reinterpret_cast<const QDataType*>(kargs.q_ptr) +
-                                 static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_q +
-                                 batch_offset_q;
-        const KDataType* k_ptr =
-            reinterpret_cast<const KDataType*>(kargs.k_ptr) +
-            static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_k +
-            batch_offset_k;
-        const VDataType* v_ptr =
-            reinterpret_cast<const VDataType*>(kargs.v_ptr) +
-            static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_v +
-            batch_offset_v;
-        ODataType* o_ptr = reinterpret_cast<ODataType*>(kargs.o_ptr) +
-                           static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_o +
-                           batch_offset_o;
+            // for simplicity, apply the head/batch offsets by just adjusting the base pointers
+            const QDataType* q_ptr = reinterpret_cast<const QDataType*>(kargs.q_ptr) +
+                                     static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_q +
+                                     batch_offset_q;
+            const KDataType* k_ptr =
+                reinterpret_cast<const KDataType*>(kargs.k_ptr) +
+                static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_k +
+                batch_offset_k;
+            const VDataType* v_ptr =
+                reinterpret_cast<const VDataType*>(kargs.v_ptr) +
+                static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_v +
+                batch_offset_v;
+            ODataType* o_ptr = reinterpret_cast<ODataType*>(kargs.o_ptr) +
+                               static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_o +
+                               batch_offset_o;
 
-        // Q/K/V DRAM and DRAM window
-        const auto q_dram = [&]() {
-            const auto q_dram_naive = make_naive_tensor_view<address_space_enum::global>(
-                q_ptr,
-                make_tuple(kargs.seqlen_q, kargs.hdim_q),
-                make_tuple(kargs.stride_q, 1),
-                number<FmhaPipeline::kAlignmentQ>{},
-                number<1>{});
-            if constexpr(FmhaPipeline::kQLoadOnce)
-            {
-                return pad_tensor_view(
-                    q_dram_naive,
-                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kSubQKHeaddim>{}),
-                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
-            }
-            else
-            {
-                return pad_tensor_view(
-                    q_dram_naive,
-                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{}),
-                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
-            }
-        }();
-        const auto k_dram = [&]() {
-            const auto k_dram_naive = make_naive_tensor_view<address_space_enum::global>(
-                k_ptr,
-                make_tuple(kargs.seqlen_k, kargs.hdim_q),
-                make_tuple(kargs.stride_k, 1),
-                number<FmhaPipeline::kAlignmentK>{},
-                number<1>{});
-
-            constexpr bool kPadSeqLenK_ = kUseAsyncCopy ? kPadSeqLenK : false;
-            return pad_tensor_view(
-                k_dram_naive,
-                make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
-                sequence<kPadSeqLenK_, kPadHeadDimQ>{});
-        }();
-        const auto v_dram = [&]() {
-            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
-            {
-                const auto v_dram_naive = make_naive_tensor_view<address_space_enum::global>(
-                    v_ptr,
-                    make_tuple(kargs.seqlen_k, kargs.hdim_v),
-                    make_tuple(kargs.stride_v, 1),
-                    number<FmhaPipeline::kAlignmentV>{},
+            // Q/K/V DRAM and DRAM window
+            const auto q_dram = [&]() {
+                const auto q_dram_naive = make_naive_tensor_view<address_space_enum::global>(
+                    q_ptr,
+                    make_tuple(kargs.seqlen_q, kargs.hdim_q),
+                    make_tuple(kargs.stride_q, 1),
+                    number<FmhaPipeline::kAlignmentQ>{},
+                    number<1>{});
+                if constexpr(FmhaPipeline::kQLoadOnce)
+                {
+                    return pad_tensor_view(q_dram_naive,
+                                           make_tuple(number<FmhaPipeline::kM0>{},
+                                                      number<FmhaPipeline::kSubQKHeaddim>{}),
+                                           sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+                }
+                else
+                {
+                    return pad_tensor_view(
+                        q_dram_naive,
+                        make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{}),
+                        sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+                }
+            }();
+            const auto k_dram = [&]() {
+                const auto k_dram_naive = make_naive_tensor_view<address_space_enum::global>(
+                    k_ptr,
+                    make_tuple(kargs.seqlen_k, kargs.hdim_q),
+                    make_tuple(kargs.stride_k, 1),
+                    number<FmhaPipeline::kAlignmentK>{},
                     number<1>{});
-
-                const auto v_dram_transposed =
-                    transform_tensor_view(v_dram_naive,
-                                          make_tuple(make_pass_through_transform(kargs.hdim_v),
-                                                     make_pass_through_transform(kargs.seqlen_k)),
-                                          make_tuple(sequence<1>{}, sequence<0>{}),
-                                          make_tuple(sequence<0>{}, sequence<1>{}));
 
                 constexpr bool kPadSeqLenK_ = kUseAsyncCopy ? kPadSeqLenK : false;
                 return pad_tensor_view(
-                    v_dram_transposed,
-                    make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, kPadSeqLenK_>{});
+                    k_dram_naive,
+                    make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
+                    sequence<kPadSeqLenK_, kPadHeadDimQ>{});
+            }();
+            const auto v_dram = [&]() {
+                if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+                {
+                    const auto v_dram_naive = make_naive_tensor_view<address_space_enum::global>(
+                        v_ptr,
+                        make_tuple(kargs.seqlen_k, kargs.hdim_v),
+                        make_tuple(kargs.stride_v, 1),
+                        number<FmhaPipeline::kAlignmentV>{},
+                        number<1>{});
+
+                    const auto v_dram_transposed = transform_tensor_view(
+                        v_dram_naive,
+                        make_tuple(make_pass_through_transform(kargs.hdim_v),
+                                   make_pass_through_transform(kargs.seqlen_k)),
+                        make_tuple(sequence<1>{}, sequence<0>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+
+                    constexpr bool kPadSeqLenK_ = kUseAsyncCopy ? kPadSeqLenK : false;
+                    return pad_tensor_view(
+                        v_dram_transposed,
+                        make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
+                        sequence<kPadHeadDimV, kPadSeqLenK_>{});
+                }
+                else
+                {
+                    const auto v_dram_naive = make_naive_tensor_view<address_space_enum::global>(
+                        v_ptr,
+                        make_tuple(kargs.hdim_v, kargs.seqlen_k),
+                        make_tuple(kargs.stride_v, 1),
+                        number<FmhaPipeline::kAlignmentV>{},
+                        number<1>{});
+
+                    constexpr bool kPadHeadDimV_ = kUseAsyncCopy ? kPadHeadDimV : false;
+                    return pad_tensor_view(
+                        v_dram_naive,
+                        make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
+                        sequence<kPadHeadDimV_, kPadSeqLenK>{});
+                }
+            }();
+
+            auto q_dram_window = make_tile_window(
+                q_dram,
+                [&]() {
+                    if constexpr(FmhaPipeline::kQLoadOnce)
+                        return make_tuple(number<FmhaPipeline::kM0>{},
+                                          number<FmhaPipeline::kSubQKHeaddim>{});
+                    else
+                        return make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{});
+                }(),
+                {i_m0, 0});
+
+            auto k_dram_window = make_tile_window(
+                k_dram,
+                make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
+                {0, 0});
+
+            auto v_dram_window = make_tile_window(
+                v_dram,
+                make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
+                {i_n1, 0});
+            /// FIXME: Before C++20, capturing structured binding variables is not supported.
+            /// Remove the following copy capture of 'i_nhead' once on C++20
+            const auto bias_dram_window = [&, i_nhead_ = i_nhead]() {
+                constexpr auto bias_dram_window_lengths =
+                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN0>{});
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
+                {
+                    const BiasDataType* bias_ptr =
+                        reinterpret_cast<const BiasDataType*>(kargs.bias_ptr) +
+                        static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_bias +
+                        batch_offset_bias;
+
+                    const auto bias_dram = [&]() {
+                        const auto bias_dram_naive =
+                            make_naive_tensor_view<address_space_enum::global>(
+                                bias_ptr,
+                                make_tuple(kargs.seqlen_q, kargs.seqlen_k),
+                                make_tuple(kargs.stride_bias, 1),
+                                number<FmhaPipeline::kAlignmentBias>{},
+                                number<1>{});
+
+                        return pad_tensor_view(bias_dram_naive,
+                                               bias_dram_window_lengths,
+                                               sequence<kPadSeqLenQ, kPadSeqLenK>{});
+                    }();
+
+                    return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
+                }
+                else
+                {
+                    return make_null_tile_window(bias_dram_window_lengths);
+                }
+            }();
+
+            // lse
+            auto lse_dram_window = [&, i_nhead_ = i_nhead]() {
+                constexpr auto lse_dram_window_lengths = make_tuple(number<FmhaPipeline::kM0>{});
+                if constexpr(kStoreLSE)
+                {
+                    LSEDataType* lse_ptr =
+                        reinterpret_cast<LSEDataType*>(kargs.lse_ptr) +
+                        static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_lse +
+                        batch_offset_lse;
+
+                    const auto lse_dram = [&]() {
+                        const auto lse_dram_naive =
+                            make_naive_tensor_view<address_space_enum::global>(
+                                lse_ptr,
+                                make_tuple(kargs.seqlen_q),
+                                make_tuple(1),
+                                number<1>{},
+                                number<1>{});
+
+                        return pad_tensor_view(
+                            lse_dram_naive, lse_dram_window_lengths, sequence<kPadSeqLenQ>{});
+                    }();
+
+                    return make_tile_window(lse_dram, lse_dram_window_lengths, {i_m0});
+                }
+                else
+                {
+                    return make_null_tile_window(lse_dram_window_lengths);
+                }
+            }();
+
+            auto dropout = [&, i_nhead_ = i_nhead, i_batch_ = i_batch]() {
+                if constexpr(kHasDropout)
+                {
+                    return BlockDropout{i_batch_,
+                                        i_nhead_,
+                                        kargs.num_head_q,
+                                        kargs.is_drop_seed_offset_from_host ? kargs.drop_seed.val
+                                                                            : *kargs.drop_seed.ptr,
+                                        kargs.is_drop_seed_offset_from_host
+                                            ? kargs.drop_offset.val
+                                            : *kargs.drop_offset.ptr,
+                                        kargs.rp_undrop,
+                                        kargs.p_undrop_in_uint8_t,
+                                        kargs.is_store_randval};
+                }
+                else
+                {
+                    return NullBlockDropout{};
+                };
+            }();
+
+            auto randval_dram_window = [&, i_nhead_ = i_nhead]() {
+                constexpr auto randval_dram_window_lengths =
+                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN0>{});
+                if constexpr(kHasDropout)
+                {
+                    RandValOutputDataType* rand_val_ptr =
+                        reinterpret_cast<RandValOutputDataType*>(kargs.rand_val_ptr) +
+                        static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_randval +
+                        batch_offset_randval;
+
+                    const auto randval_dram = [&]() {
+                        const auto randval_dram_naive =
+                            make_naive_tensor_view<address_space_enum::global>(
+                                rand_val_ptr,
+                                make_tuple(kargs.seqlen_q, kargs.seqlen_k),
+                                make_tuple(kargs.stride_randval, 1),
+                                number<1>{},
+                                number<1>{});
+
+                        return pad_tensor_view(randval_dram_naive,
+                                               randval_dram_window_lengths,
+                                               sequence<kPadSeqLenQ, kPadSeqLenK>{});
+                    }();
+
+                    return make_tile_window(randval_dram, randval_dram_window_lengths, {i_m0, 0});
+                }
+                else
+                {
+                    return make_null_tile_window(randval_dram_window_lengths);
+                }
+            }();
+
+            FmhaMask mask = [&]() {
+                if constexpr(kHasMask)
+                    return ck_tile::make_generic_attention_mask_from_lr_window<FmhaMask>(
+                        kargs.window_size_left,
+                        kargs.window_size_right,
+                        kargs.seqlen_q,
+                        kargs.seqlen_k,
+                        kargs.mask_type == GenericAttentionMaskEnum::MASK_FROM_TOP_LEFT);
+                else
+                    return FmhaMask{kargs.seqlen_q, kargs.seqlen_k};
+            }();
+
+            // WA: copy-capture i_batch/i_nhead; structured bindings can't be captured before C++20
+            auto position_encoding = [&, i_batch_ = i_batch, i_nhead_ = i_nhead]() {
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                {
+                    // data loading, shared by entire wg
+                    // TODO: how to use s_read?
+                    SaccDataType slope =
+                        *(reinterpret_cast<const SaccDataType*>(kargs.alibi_slope_ptr) +
+                          i_batch_ * kargs.alibi_slope_stride + i_nhead_);
+#if CK_TILE_FMHA_FWD_FAST_EXP2
+                    slope *= ck_tile::log2e_v<>;
+#endif
+                    if constexpr(kHasMask)
+                    {
+                        return make_alibi_from_lr_mask<SaccDataType, true>(slope,
+                                                                           kargs.window_size_left,
+                                                                           kargs.window_size_right,
+                                                                           kargs.seqlen_q,
+                                                                           kargs.seqlen_k,
+                                                                           kargs.mask_type);
+                    }
+                    else
+                    {
+                        return Alibi<SaccDataType, true>{
+                            slope, kargs.seqlen_q, kargs.seqlen_k, AlibiMode::FROM_BOTTOM_RIGHT};
+                    }
+                }
+                else
+                {
+                    return EmptyPositionEncoding<SaccDataType>{};
+                }
+            }();
+
+            AttentionVariant variant;
+            const auto variant_params = [&] {
+                if constexpr(kHasLogitsSoftCap)
+                {
+                    return ck_tile::LogitsSoftCapParams<FmhaMask, CK_TILE_FMHA_FWD_FAST_EXP2>{
+                        mask, kargs.scale_s, kargs.logits_soft_cap, kargs.logits_soft_cap_rcp};
+                }
+                else
+                {
+                    return ck_tile::StandardAttentionParams<FmhaMask>{mask, kargs.scale_s};
+                }
+            }();
+
+            BlockIndices block_indices{i_batch, i_nhead, i_nhead / kargs.nhead_ratio_qk};
+
+            auto o_acc_tile = [&]() {
+                if constexpr(kDoFp8StaticQuant)
+                {
+                    return FmhaPipeline{}(
+                        q_dram_window,
+                        identity{}, // q_element_func
+                        k_dram_window,
+                        identity{}, // k_element_func
+                        v_dram_window,
+                        identity{}, // v_element_func
+                        bias_dram_window,
+                        identity{}, // bias_element_func
+                        randval_dram_window,
+                        lse_dram_window,
+                        identity{},            // lse_element_func
+                        identity{},            // s_acc_element_func
+                        scales{kargs.scale_p}, // p_compute_element_func
+                        composes(saturates<fp8_t>{}, scales{kargs.scale_o}), // o_acc_element_func
+                        mask,
+                        position_encoding,
+                        kargs.scale_s,
+                        variant,
+                        variant_params,
+                        block_indices,
+                        smem_ptr,
+                        dropout);
+                }
+                else
+                {
+                    return FmhaPipeline{}(q_dram_window,
+                                          k_dram_window,
+                                          v_dram_window,
+                                          bias_dram_window,
+                                          randval_dram_window,
+                                          lse_dram_window,
+                                          mask,
+                                          position_encoding,
+                                          kargs.scale_s,
+                                          variant,
+                                          variant_params,
+                                          block_indices,
+                                          smem_ptr,
+                                          dropout);
+                }
+            }();
+
+            // O DRAM and O DRAM window
+            auto o_dram = [&]() {
+                const auto o_dram_naive = make_naive_tensor_view<address_space_enum::global>(
+                    o_ptr,
+                    make_tuple(kargs.seqlen_q, kargs.hdim_v),
+                    make_tuple(kargs.stride_o, 1),
+                    number<FmhaPipeline::kAlignmentO>{},
+                    number<1>{});
+
+                return pad_tensor_view(
+                    o_dram_naive,
+                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
+                    sequence<kPadSeqLenQ, kPadHeadDimV>{});
+            }();
+
+            auto o_dram_window = make_tile_window(
+                o_dram,
+                make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
+                {i_m0, i_n1});
+
+            EpiloguePipeline{}(o_dram_window, o_acc_tile);
+        }
+        else
+        {
+            // TODO: Refine the logic here.
+            // In Decode case
+            //     1. we don't expect KV data reused by different ThreadGroups, bypass the cache
+            //     2. limit the LDS usage, as we want higher occupancy
+            // In Prefill case
+            //     1. we expect KV data reused by different ThreadGroups, use cache
+            //     2. use more LDS, as we want better memory latency hiding
+            // If SplitKV off, we don't expect Q data reused by different ThreadGroups, bypass the
+            // cache
+            constexpr bool PrefillCase = FmhaPipeline::kM0 >= 128;
+            // divide problem
+            const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
+
+            const index_t i_m0 = i_tile_m * FmhaPipeline::kM0;
+            const index_t i_n1 = i_tile_n * FmhaPipeline::kN1;
+
+            long_index_t batch_offset_q    = 0;
+            long_index_t batch_offset_k    = 0; // unused for paged-kvcache
+            long_index_t batch_offset_v    = 0; // unused for paged-kvcache
+            long_index_t batch_offset_bias = 0;
+            long_index_t batch_offset_lse  = 0;
+            long_index_t batch_offset_o    = 0;
+            // index_t kv_l2p_offset =
+            //     0; // logical-to-physical offset of seqlen_k coordinate. only used for
+            //     paged-kvcache
+
+            if constexpr(kIsGroupMode)
+            {
+                // get starting offset for each batch
+                const long_index_t query_start = kargs.seqstart_q_ptr[i_batch];
+                const long_index_t key_start   = kargs.seqstart_k_ptr[i_batch];
+
+                batch_offset_q = query_start * kargs.stride_q;
+                batch_offset_k = key_start * kargs.stride_k;
+                if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+                {
+                    batch_offset_v = key_start * kargs.stride_v;
+                }
+                else
+                {
+                    batch_offset_v = key_start;
+                }
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
+                {
+                    batch_offset_bias = query_start * kargs.stride_bias;
+                }
+
+                batch_offset_lse = query_start;
+                batch_offset_o   = query_start * kargs.stride_o;
+
+                // get real # queries & # keys under group mode
+                kargs.seqlen_q = kargs.seqstart_q_ptr[i_batch + 1] - kargs.seqstart_q_ptr[i_batch];
+
+                // the number of required blocks differs per group, so terminate unnecessary
+                // blocks early
+                if(kargs.seqlen_q <= i_m0)
+                {
+                    return;
+                }
+
+                if(kargs.seqlen_k_ptr != nullptr)
+                {
+                    kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];
+                }
+                else
+                {
+                    kargs.seqlen_k =
+                        kargs.seqstart_k_ptr[i_batch + 1] - kargs.seqstart_k_ptr[i_batch];
+                }
             }
             else
             {
+                batch_offset_q = static_cast<long_index_t>(i_batch) * kargs.batch_stride_q;
+                batch_offset_k = static_cast<long_index_t>(i_batch) * kargs.batch_stride_k;
+                batch_offset_v = static_cast<long_index_t>(i_batch) * kargs.batch_stride_v;
+                if constexpr(kStoreLSE)
+                {
+                    batch_offset_lse = static_cast<long_index_t>(i_batch) * kargs.batch_stride_lse;
+                }
+                batch_offset_o = static_cast<long_index_t>(i_batch) * kargs.batch_stride_o;
+
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
+                {
+                    batch_offset_bias =
+                        static_cast<long_index_t>(i_batch) * kargs.batch_stride_bias;
+                }
+            }
+
+            // for simplicity, apply the batch offsets by just adjusting the base pointers
+            const index_t i_nhead_k = i_nhead / kargs.nhead_ratio_qk;
+
+            const QDataType* q_ptr = reinterpret_cast<const QDataType*>(kargs.q_ptr) +
+                                     static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_q +
+                                     batch_offset_q;
+            const KDataType* k_ptr = reinterpret_cast<const KDataType*>(kargs.k_ptr) +
+                                     static_cast<long_index_t>(i_nhead_k) * kargs.nhead_stride_k +
+                                     batch_offset_k;
+            const VDataType* v_ptr = reinterpret_cast<const VDataType*>(kargs.v_ptr) +
+                                     static_cast<long_index_t>(i_nhead_k) * kargs.nhead_stride_v +
+                                     batch_offset_v;
+
+            ODataType* o_ptr = reinterpret_cast<ODataType*>(kargs.o_ptr) +
+                               static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_o +
+                               batch_offset_o;
+
+            // Q/K/V DRAM and DRAM window
+            const auto q_dram = [&] {
+                const auto q_dram_naive = [&] {
+                    {
+                        return make_naive_tensor_view<address_space_enum::global,
+                                                      memory_operation_enum::set,
+                                                      amd_buffer_coherence_enum::SYSTEM_NT1>(
+                            q_ptr,
+                            make_tuple(kargs.seqlen_q, kargs.hdim_q),
+                            make_tuple(kargs.stride_q, 1),
+                            number<FmhaPipeline::kAlignmentQ>{},
+                            number<1>{});
+                    }
+                }();
+
+                if constexpr(FmhaPipeline::kQLoadOnce)
+                {
+                    const auto seqlen_q   = kargs.seqlen_q;
+                    const auto q_dram_pad = pad_tensor_view(
+                        q_dram_naive,
+                        make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{}),
+                        sequence<false, kPadHeadDimQ>{});
+#if CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD
+                    constexpr index_t LDSLayerSize  = 256 / sizeof(QDataType);
+                    constexpr index_t XorLengthFold = LDSLayerSize / (FmhaPipeline::kQKHeaddim);
+
+                    if constexpr(XorLengthFold > 1)
+                    {
+                        const auto q_dram_unmerged = transform_tensor_view(
+                            q_dram_pad,
+                            make_tuple(
+                                make_unmerge_transform(
+                                    make_tuple(seqlen_q / XorLengthFold, XorLengthFold)),
+                                make_pass_through_transform(number<FmhaPipeline::kQKHeaddim>{})),
+                            make_tuple(sequence<0>{}, sequence<1>{}),
+                            make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                        const auto q_dram_merged = transform_tensor_view(
+                            q_dram_unmerged,
+                            make_tuple(make_pass_through_transform(seqlen_q / XorLengthFold),
+                                       make_merge_transform_v3_division_mod(make_tuple(
+                                           XorLengthFold, number<FmhaPipeline::kQKHeaddim>{}))),
+                            make_tuple(sequence<0>{}, sequence<1, 2>{}),
+                            make_tuple(sequence<0>{}, sequence<1>{}));
+
+                        const auto q_dram_unmerged_xor = transform_tensor_view(
+                            q_dram_merged,
+                            make_tuple(make_pass_through_transform(seqlen_q / XorLengthFold),
+                                       make_unmerge_transform(make_tuple(
+                                           number<LDSLayerSize / FmhaPipeline::kAlignmentQ>{},
+                                           number<FmhaPipeline::kAlignmentQ>{}))),
+                            make_tuple(sequence<0>{}, sequence<1>{}),
+                            make_tuple(sequence<0>{}, sequence<1, 2>{}));
+
+                        const auto q_dram_permuted = transform_tensor_view(
+                            q_dram_unmerged_xor,
+                            make_tuple(
+                                make_xor_transform(
+                                    make_tuple(seqlen_q / XorLengthFold,
+                                               number<LDSLayerSize / FmhaPipeline::kAlignmentQ>{})),
+                                make_pass_through_transform(number<FmhaPipeline::kAlignmentQ>{})),
+                            make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                            make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                        const auto q_dram_tmp = transform_tensor_view(
+                            q_dram_permuted,
+                            make_tuple(
+                                make_pass_through_transform(seqlen_q / XorLengthFold),
+                                make_unmerge_transform(
+                                    make_tuple(number<XorLengthFold>{},
+                                               number<FmhaPipeline::kQKHeaddim /
+                                                      FmhaPipeline::kAlignmentQ>{})),
+                                make_pass_through_transform(number<FmhaPipeline::kAlignmentQ>{})),
+                            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+                            make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3>{}));
+
+                        return transform_tensor_view(
+                            q_dram_tmp,
+                            make_tuple(
+                                make_merge_transform_v3_division_mod(
+                                    make_tuple(seqlen_q / XorLengthFold, number<XorLengthFold>{})),
+                                make_merge_transform_v3_division_mod(make_tuple(
+                                    number<FmhaPipeline::kQKHeaddim / FmhaPipeline::kAlignmentQ>{},
+                                    number<FmhaPipeline::kAlignmentQ>{}))),
+                            make_tuple(sequence<0, 1>{}, sequence<2, 3>{}),
+                            make_tuple(sequence<0>{}, sequence<1>{}));
+                    }
+                    else
+#endif // CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD
+                    {
+                        const auto q_dram_unmerged = transform_tensor_view(
+                            q_dram_pad,
+                            make_tuple(
+                                make_pass_through_transform(seqlen_q),
+                                make_unmerge_transform(make_tuple(
+                                    number<FmhaPipeline::kQKHeaddim / FmhaPipeline::kAlignmentQ>{},
+                                    number<FmhaPipeline::kAlignmentQ>{}))),
+                            make_tuple(sequence<0>{}, sequence<1>{}),
+                            make_tuple(sequence<0>{}, sequence<1, 2>{}));
+
+                        const auto q_dram_permuted = transform_tensor_view(
+                            q_dram_unmerged,
+                            make_tuple(
+                                make_xor_transform(make_tuple(seqlen_q,
+                                                              number<FmhaPipeline::kQKHeaddim /
+                                                                     FmhaPipeline::kAlignmentQ>{})),
+                                make_pass_through_transform(number<FmhaPipeline::kAlignmentQ>{})),
+                            make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                            make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                        return transform_tensor_view(
+                            q_dram_permuted,
+                            make_tuple(
+                                make_pass_through_transform(seqlen_q),
+                                make_merge_transform_v3_division_mod(make_tuple(
+                                    number<FmhaPipeline::kQKHeaddim / FmhaPipeline::kAlignmentQ>{},
+                                    number<FmhaPipeline::kAlignmentQ>{}))),
+                            make_tuple(sequence<0>{}, sequence<1, 2>{}),
+                            make_tuple(sequence<0>{}, sequence<1>{}));
+                    }
+                }
+                else
+                {
+                    return pad_tensor_view(
+                        q_dram_naive,
+                        make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{}),
+                        sequence<false, kPadHeadDimQ>{});
+                }
+            }();
+
+            const auto make_k_dram = [&](const KDataType* data, index_t height) {
+                const auto k_dram_naive = make_naive_tensor_view<address_space_enum::global>(
+                    data, // will update this pointer if using paged-kvcache
+                    make_tuple(height, kargs.hdim_q),
+                    make_tuple(kargs.stride_k, 1),
+                    number<FmhaPipeline::kAlignmentK>{},
+                    number<1>{});
+
+                const auto k_dram_pad = pad_tensor_view(
+                    k_dram_naive,
+                    make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
+                    sequence<false, kPadHeadDimQ>{});
+
+#if CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD
+                constexpr index_t LDSLayerSize  = 256 / sizeof(KDataType);
+                constexpr index_t XorLengthFold = LDSLayerSize / (FmhaPipeline::kQKHeaddim);
+
+                if constexpr(XorLengthFold > 1)
+                {
+                    const auto k_dram_unmerged = transform_tensor_view(
+                        k_dram_pad,
+                        make_tuple(make_unmerge_transform(
+                                       make_tuple(height / XorLengthFold, XorLengthFold)),
+                                   make_pass_through_transform(number<FmhaPipeline::kQKHeaddim>{})),
+                        make_tuple(sequence<0>{}, sequence<1>{}),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                    const auto k_dram_merged = transform_tensor_view(
+                        k_dram_unmerged,
+                        make_tuple(make_pass_through_transform(height / XorLengthFold),
+                                   make_merge_transform_v3_division_mod(make_tuple(
+                                       XorLengthFold, number<FmhaPipeline::kQKHeaddim>{}))),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+
+                    const auto k_dram_unmerged_xor = transform_tensor_view(
+                        k_dram_merged,
+                        make_tuple(make_pass_through_transform(height / XorLengthFold),
+                                   make_unmerge_transform(make_tuple(
+                                       number<LDSLayerSize / FmhaPipeline::kAlignmentK>{},
+                                       number<FmhaPipeline::kAlignmentK>{}))),
+                        make_tuple(sequence<0>{}, sequence<1>{}),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}));
+
+                    const auto k_dram_permuted = transform_tensor_view(
+                        k_dram_unmerged_xor,
+                        make_tuple(
+                            make_xor_transform(
+                                make_tuple(height / XorLengthFold,
+                                           number<LDSLayerSize / FmhaPipeline::kAlignmentK>{})),
+                            make_pass_through_transform(number<FmhaPipeline::kAlignmentK>{})),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                    const auto k_dram_tmp = transform_tensor_view(
+                        k_dram_permuted,
+                        make_tuple(
+                            make_pass_through_transform(height / XorLengthFold),
+                            make_unmerge_transform(make_tuple(
+                                number<XorLengthFold>{},
+                                number<FmhaPipeline::kQKHeaddim / FmhaPipeline::kAlignmentK>{})),
+                            make_pass_through_transform(number<FmhaPipeline::kAlignmentK>{})),
+                        make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3>{}));
+
+                    return transform_tensor_view(
+                        k_dram_tmp,
+                        make_tuple(
+                            make_merge_transform_v3_division_mod(
+                                make_tuple(height / XorLengthFold, number<XorLengthFold>{})),
+                            make_merge_transform_v3_division_mod(make_tuple(
+                                number<FmhaPipeline::kQKHeaddim / FmhaPipeline::kAlignmentK>{},
+                                number<FmhaPipeline::kAlignmentK>{}))),
+                        make_tuple(sequence<0, 1>{}, sequence<2, 3>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                }
+                else
+#endif // CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD
+                {
+                    const auto k_dram_unmerged = transform_tensor_view(
+                        k_dram_pad,
+                        make_tuple(
+                            make_pass_through_transform(height),
+                            make_unmerge_transform(make_tuple(
+                                number<FmhaPipeline::kQKHeaddim / FmhaPipeline::kAlignmentK>{},
+                                number<FmhaPipeline::kAlignmentK>{}))),
+                        make_tuple(sequence<0>{}, sequence<1>{}),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}));
+
+                    const auto k_dram_permuted = transform_tensor_view(
+                        k_dram_unmerged,
+                        make_tuple(
+                            make_xor_transform(make_tuple(
+                                height,
+                                number<FmhaPipeline::kQKHeaddim / FmhaPipeline::kAlignmentK>{})),
+                            make_pass_through_transform(number<FmhaPipeline::kAlignmentK>{})),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                    return transform_tensor_view(
+                        k_dram_permuted,
+                        make_tuple(
+                            make_pass_through_transform(height),
+                            make_merge_transform_v3_division_mod(make_tuple(
+                                number<FmhaPipeline::kQKHeaddim / FmhaPipeline::kAlignmentK>{},
+                                number<FmhaPipeline::kAlignmentK>{}))),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                }
+            };
+            const auto k_dram = [&]() {
+                {
+                    return make_k_dram(k_ptr, kargs.seqlen_k);
+                }
+            }();
+
+            const auto make_v_dram = [&](const VDataType* data, index_t length) {
                 const auto v_dram_naive = make_naive_tensor_view<address_space_enum::global>(
-                    v_ptr,
-                    make_tuple(kargs.hdim_v, kargs.seqlen_k),
-                    make_tuple(kargs.stride_v, 1),
+                    data, // will update this pointer if using paged-kvcache
+                    make_tuple(length, kargs.hdim_v),
+                    make_tuple(kargs.hdim_v, 1),
                     number<FmhaPipeline::kAlignmentV>{},
                     number<1>{});
 
-                constexpr bool kPadHeadDimV_ = kUseAsyncCopy ? kPadHeadDimV : false;
-                return pad_tensor_view(
+                // TODO: Add kVHeadDim
+                constexpr index_t XorGroupSize =
+                    FmhaPipeline::Problem::BlockFmhaShape::Gemm1WarpTile::at(number<0>{});
+
+                const auto v_dram_pad = pad_tensor_view(
                     v_dram_naive,
-                    make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV_, kPadSeqLenK>{});
-            }
-        }();
+                    make_tuple(number<FmhaPipeline::kK1>{}, number<FmhaPipeline::kN1>{}),
+                    sequence<kPadSeqLenK, false>{});
 
-        auto q_dram_window = make_tile_window(
-            q_dram,
-            [&]() {
-                if constexpr(FmhaPipeline::kQLoadOnce)
-                    return make_tuple(number<FmhaPipeline::kM0>{},
-                                      number<FmhaPipeline::kSubQKHeaddim>{});
+#if CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD
+                constexpr index_t LDSLayerSize  = 256 / sizeof(VDataType);
+                constexpr index_t XorLengthFold = LDSLayerSize / (FmhaPipeline::kQKHeaddim);
+
+                if constexpr(XorLengthFold > 1)
+                {
+                    const auto v_dram_unmerged = transform_tensor_view(
+                        v_dram_pad,
+                        make_tuple(make_unmerge_transform(
+                                       make_tuple(length / XorLengthFold, XorLengthFold)),
+                                   make_pass_through_transform(number<FmhaPipeline::kQKHeaddim>{})),
+                        make_tuple(sequence<0>{}, sequence<1>{}),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                    const auto v_dram_merged = transform_tensor_view(
+                        v_dram_unmerged,
+                        make_tuple(make_pass_through_transform(length / XorLengthFold),
+                                   make_merge_transform_v3_division_mod(make_tuple(
+                                       XorLengthFold, number<FmhaPipeline::kQKHeaddim>{}))),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+
+                    const auto v_dram_unmerged_xor = transform_tensor_view(
+                        v_dram_merged,
+                        make_tuple(
+                            make_pass_through_transform(length / XorLengthFold),
+                            make_unmerge_transform(make_tuple(number<LDSLayerSize / XorGroupSize>{},
+                                                              number<XorGroupSize>{}))),
+                        make_tuple(sequence<0>{}, sequence<1>{}),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}));
+
+                    const auto v_dram_permuted = transform_tensor_view(
+                        v_dram_unmerged_xor,
+                        make_tuple(
+                            make_xor_transform(make_tuple(length / XorLengthFold,
+                                                          number<LDSLayerSize / XorGroupSize>{})),
+                            make_pass_through_transform(number<XorGroupSize>{})),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                    const auto v_dram_tmp = transform_tensor_view(
+                        v_dram_permuted,
+                        make_tuple(make_pass_through_transform(length / XorLengthFold),
+                                   make_unmerge_transform(make_tuple(
+                                       number<XorLengthFold>{},
+                                       number<FmhaPipeline::kQKHeaddim / XorGroupSize>{})),
+                                   make_pass_through_transform(number<XorGroupSize>{})),
+                        make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3>{}));
+
+                    return transform_tensor_view(
+                        v_dram_tmp,
+                        make_tuple(make_merge_transform_v3_division_mod(
+                                       make_tuple(length / XorLengthFold, number<XorLengthFold>{})),
+                                   make_merge_transform_v3_division_mod(
+                                       make_tuple(number<FmhaPipeline::kQKHeaddim / XorGroupSize>{},
+                                                  number<XorGroupSize>{}))),
+                        make_tuple(sequence<0, 1>{}, sequence<2, 3>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                }
                 else
-                    return make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{});
-            }(),
-            {i_m0, 0});
+#endif // CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD
+                {
+                    const auto v_dram_unmerged = transform_tensor_view(
+                        v_dram_pad,
+                        make_tuple(make_pass_through_transform(length),
+                                   make_unmerge_transform(
+                                       make_tuple(number<FmhaPipeline::kQKHeaddim / XorGroupSize>{},
+                                                  number<XorGroupSize>{}))),
+                        make_tuple(sequence<0>{}, sequence<1>{}),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}));
 
-        auto k_dram_window = make_tile_window(
-            k_dram, make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}), {0, 0});
+                    const auto v_dram_permuted = transform_tensor_view(
+                        v_dram_unmerged,
+                        make_tuple(make_xor_transform(make_tuple(
+                                       length, number<FmhaPipeline::kQKHeaddim / XorGroupSize>{})),
+                                   make_pass_through_transform(number<XorGroupSize>{})),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}));
 
-        auto v_dram_window =
-            make_tile_window(v_dram,
-                             make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                             {i_n1, 0});
-        /// FIXME: Before C++20, capturing structured binding variables are not supported. Remove
-        /// following copy capture of the 'i_nhead' if in C++20
-        const auto bias_dram_window = [&, i_nhead_ = i_nhead]() {
-            constexpr auto bias_dram_window_lengths =
-                make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN0>{});
-            if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
-            {
-                const BiasDataType* bias_ptr =
-                    reinterpret_cast<const BiasDataType*>(kargs.bias_ptr) +
-                    static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_bias +
-                    batch_offset_bias;
-
-                const auto bias_dram = [&]() {
-                    const auto bias_dram_naive = make_naive_tensor_view<address_space_enum::global>(
-                        bias_ptr,
-                        make_tuple(kargs.seqlen_q, kargs.seqlen_k),
-                        make_tuple(kargs.stride_bias, 1),
-                        number<FmhaPipeline::kAlignmentBias>{},
-                        number<1>{});
-
-                    return pad_tensor_view(bias_dram_naive,
-                                           bias_dram_window_lengths,
-                                           sequence<kPadSeqLenQ, kPadSeqLenK>{});
-                }();
-
-                return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
-            }
-            else
-            {
-                return make_null_tile_window(bias_dram_window_lengths);
-            }
-        }();
-
-        // lse
-        auto lse_dram_window = [&, i_nhead_ = i_nhead]() {
-            constexpr auto lse_dram_window_lengths = make_tuple(number<FmhaPipeline::kM0>{});
-            if constexpr(kStoreLSE)
-            {
-                LSEDataType* lse_ptr =
-                    reinterpret_cast<LSEDataType*>(kargs.lse_ptr) +
-                    static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_lse + batch_offset_lse;
-
-                const auto lse_dram = [&]() {
-                    const auto lse_dram_naive = make_naive_tensor_view<address_space_enum::global>(
-                        lse_ptr,
-                        make_tuple(kargs.seqlen_q),
-                        make_tuple(1),
-                        number<1>{},
-                        number<1>{});
-
-                    return pad_tensor_view(
-                        lse_dram_naive, lse_dram_window_lengths, sequence<kPadSeqLenQ>{});
-                }();
-
-                return make_tile_window(lse_dram, lse_dram_window_lengths, {i_m0});
-            }
-            else
-            {
-                return make_null_tile_window(lse_dram_window_lengths);
-            }
-        }();
-
-        auto dropout = [&, i_nhead_ = i_nhead, i_batch_ = i_batch]() {
-            if constexpr(kHasDropout)
-            {
-                return BlockDropout{i_batch_,
-                                    i_nhead_,
-                                    kargs.num_head_q,
-                                    kargs.is_drop_seed_offset_from_host ? kargs.drop_seed.val
-                                                                        : *kargs.drop_seed.ptr,
-                                    kargs.is_drop_seed_offset_from_host ? kargs.drop_offset.val
-                                                                        : *kargs.drop_offset.ptr,
-                                    kargs.rp_undrop,
-                                    kargs.p_undrop_in_uint8_t,
-                                    kargs.is_store_randval};
-            }
-            else
-            {
-                return NullBlockDropout{};
+                    return transform_tensor_view(
+                        v_dram_permuted,
+                        make_tuple(make_pass_through_transform(length),
+                                   make_merge_transform_v3_division_mod(
+                                       make_tuple(number<FmhaPipeline::kQKHeaddim / XorGroupSize>{},
+                                                  number<XorGroupSize>{}))),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                }
             };
-        }();
 
-        auto randval_dram_window = [&, i_nhead_ = i_nhead]() {
-            constexpr auto randval_dram_window_lengths =
-                make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN0>{});
-            if constexpr(kHasDropout)
-            {
-                RandValOutputDataType* rand_val_ptr =
-                    reinterpret_cast<RandValOutputDataType*>(kargs.rand_val_ptr) +
-                    static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_randval +
-                    batch_offset_randval;
-
-                const auto randval_dram = [&]() {
-                    const auto randval_dram_naive =
-                        make_naive_tensor_view<address_space_enum::global>(
-                            rand_val_ptr,
-                            make_tuple(kargs.seqlen_q, kargs.seqlen_k),
-                            make_tuple(kargs.stride_randval, 1),
-                            number<1>{},
-                            number<1>{});
-
-                    return pad_tensor_view(randval_dram_naive,
-                                           randval_dram_window_lengths,
-                                           sequence<kPadSeqLenQ, kPadSeqLenK>{});
-                }();
-
-                return make_tile_window(randval_dram, randval_dram_window_lengths, {i_m0, 0});
-            }
-            else
-            {
-                return make_null_tile_window(randval_dram_window_lengths);
-            }
-        }();
-
-        FmhaMask mask = [&]() {
-            if constexpr(kHasMask)
-                return ck_tile::make_generic_attention_mask_from_lr_window<FmhaMask>(
-                    kargs.window_size_left,
-                    kargs.window_size_right,
-                    kargs.seqlen_q,
-                    kargs.seqlen_k,
-                    kargs.mask_type == GenericAttentionMaskEnum::MASK_FROM_TOP_LEFT);
-            else
-                return FmhaMask{kargs.seqlen_q, kargs.seqlen_k};
-        }();
-
-        // WA i_batch capture structure binding before c++20
-        auto position_encoding = [&, i_batch_ = i_batch, i_nhead_ = i_nhead]() {
-            if constexpr(BiasEnum == BlockAttentionBiasEnum::ALIBI)
-            {
-                // data loading, shared by entire wg
-                // TODO: how to use s_read?
-                SaccDataType slope =
-                    *(reinterpret_cast<const SaccDataType*>(kargs.alibi_slope_ptr) +
-                      i_batch_ * kargs.alibi_slope_stride + i_nhead_);
-#if CK_TILE_FMHA_FWD_FAST_EXP2
-                slope *= ck_tile::log2e_v<>;
-#endif
-                if constexpr(kHasMask)
+            const auto v_dram = [&]() {
                 {
-                    return make_alibi_from_lr_mask<SaccDataType, true>(slope,
-                                                                       kargs.window_size_left,
-                                                                       kargs.window_size_right,
-                                                                       kargs.seqlen_q,
-                                                                       kargs.seqlen_k,
-                                                                       kargs.mask_type);
+                    return make_v_dram(v_ptr, kargs.seqlen_k);
+                }
+            }();
+
+            auto q_dram_window = make_tile_window(
+                q_dram,
+                [&]() {
+                    if constexpr(FmhaPipeline::kQLoadOnce)
+                        return make_tuple(number<FmhaPipeline::kM0>{},
+                                          number<FmhaPipeline::kSubQKHeaddim>{});
+                    else
+                        return make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{});
+                }(),
+                {i_m0, 0});
+
+            auto k_dram_window = make_tile_window(
+                k_dram,
+                make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
+                {0, 0});
+
+            auto v_dram_window = make_tile_window(
+                v_dram,
+                make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
+                {0, 0});
+
+            /// FIXME: Before C++20, lambdas cannot capture structured bindings.
+            /// Remove the following copy capture of 'i_nhead' once C++20 is in use.
+            const auto bias_dram_window = [&, i_nhead_ = i_nhead]() {
+                constexpr auto bias_dram_window_lengths =
+                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN0>{});
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
+                {
+                    const BiasDataType* bias_ptr =
+                        reinterpret_cast<const BiasDataType*>(kargs.bias_ptr) +
+                        static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_bias +
+                        batch_offset_bias;
+
+                    const auto bias_dram = [&]() {
+                        const auto bias_dram_naive =
+                            make_naive_tensor_view<address_space_enum::global>(
+                                bias_ptr,
+                                make_tuple(kargs.seqlen_q, kargs.seqlen_k),
+                                make_tuple(kargs.stride_bias, 1),
+                                number<FmhaPipeline::kAlignmentBias>{},
+                                number<1>{});
+
+                        return pad_tensor_view(bias_dram_naive,
+                                               bias_dram_window_lengths,
+                                               sequence<false, kPadSeqLenK>{});
+                    }();
+
+                    return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
                 }
                 else
                 {
-                    return Alibi<SaccDataType, true>{
-                        slope, kargs.seqlen_q, kargs.seqlen_k, AlibiMode::FROM_BOTTOM_RIGHT};
+                    return make_null_tile_window(bias_dram_window_lengths);
                 }
-            }
-            else
-            {
-                return EmptyPositionEncoding<SaccDataType>{};
-            }
-        }();
+            }();
 
-        AttentionVariant variant;
-        const auto variant_params = [&] {
-            if constexpr(kHasLogitsSoftCap)
-            {
-                return ck_tile::LogitsSoftCapParams<FmhaMask, CK_TILE_FMHA_FWD_FAST_EXP2>{
-                    mask, kargs.scale_s, kargs.logits_soft_cap, kargs.logits_soft_cap_rcp};
-            }
-            else
-            {
-                return ck_tile::StandardAttentionParams<FmhaMask>{mask, kargs.scale_s};
-            }
-        }();
+            // lse acc
+            auto lse_dram_window = [&, i_nhead_ = i_nhead]() {
+                constexpr auto lse_dram_window_lengths = make_tuple(number<FmhaPipeline::kM0>{});
+                if constexpr(kStoreLSE)
+                {
+                    LSEDataType* lse_ptr =
+                        reinterpret_cast<LSEDataType*>(kargs.lse_ptr) +
+                        static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_lse +
+                        batch_offset_lse;
 
-        BlockIndices block_indices{i_batch, i_nhead, i_nhead / kargs.nhead_ratio_qk};
+                    const auto lse_dram = [&] {
+                        const auto lse_dram_naive = [&] {
+                            {
+                                return make_naive_tensor_view<address_space_enum::global>(
+                                    lse_ptr,
+                                    make_tuple(kargs.seqlen_q),
+                                    make_tuple(1),
+                                    number<1>{},
+                                    number<1>{});
+                            }
+                        }();
+                        return pad_tensor_view(
+                            lse_dram_naive, lse_dram_window_lengths, sequence<kPadSeqLenQ>{});
+                    }();
 
-        auto o_acc_tile = [&]() {
-            if constexpr(kDoFp8StaticQuant)
-            {
-                return FmhaPipeline{}(
-                    q_dram_window,
-                    identity{}, // q_element_func
-                    k_dram_window,
-                    identity{}, // k_element_func
-                    v_dram_window,
-                    identity{}, // v_element_func
-                    bias_dram_window,
-                    identity{}, // bias_element_func
-                    randval_dram_window,
-                    lse_dram_window,
-                    identity{},                                          // lse_element_func
-                    identity{},                                          // s_acc_element_func
-                    scales{kargs.scale_p},                               // p_compute_element_func
-                    composes(saturates<fp8_t>{}, scales{kargs.scale_o}), // o_acc_element_func
-                    mask,
-                    position_encoding,
-                    kargs.scale_s,
-                    variant,
-                    variant_params,
-                    block_indices,
-                    smem_ptr,
-                    dropout);
-            }
-            else
-            {
-                return FmhaPipeline{}(q_dram_window,
-                                      k_dram_window,
-                                      v_dram_window,
-                                      bias_dram_window,
-                                      randval_dram_window,
-                                      lse_dram_window,
-                                      mask,
-                                      position_encoding,
-                                      kargs.scale_s,
-                                      variant,
-                                      variant_params,
-                                      block_indices,
-                                      smem_ptr,
-                                      dropout);
-            }
-        }();
+                    return make_tile_window(lse_dram, lse_dram_window_lengths, {i_m0});
+                }
+                else
+                {
+                    return make_null_tile_window(lse_dram_window_lengths);
+                }
+            }();
 
-        // O DRAM and O DRAM window
-        auto o_dram = [&]() {
-            const auto o_dram_naive = make_naive_tensor_view<address_space_enum::global>(
-                o_ptr,
-                make_tuple(kargs.seqlen_q, kargs.hdim_v),
-                make_tuple(kargs.stride_o, 1),
-                number<FmhaPipeline::kAlignmentO>{},
-                number<1>{});
+            FmhaMask mask = [&]() {
+                if constexpr(kHasMask)
+                    return ck_tile::make_generic_attention_mask_from_lr_window<FmhaMask>(
+                        kargs.window_size_left,
+                        kargs.window_size_right,
+                        kargs.seqlen_q,
+                        kargs.seqlen_k,
+                        kargs.mask_type == GenericAttentionMaskEnum::MASK_FROM_TOP_LEFT);
+                else
+                    return FmhaMask{kargs.seqlen_q, kargs.seqlen_k};
+            }();
 
-            return pad_tensor_view(
-                o_dram_naive,
+            // WA: copy-capture i_batch/i_nhead; structured bindings can't be captured before C++20
+            auto position_encoding = [&, i_batch_ = i_batch, i_nhead_ = i_nhead]() {
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                {
+                    // data loading, shared by entire wg
+                    // TODO: how to use s_read?
+                    SaccDataType slope =
+                        *(reinterpret_cast<const SaccDataType*>(kargs.alibi_slope_ptr) +
+                          i_batch_ * kargs.alibi_slope_stride + i_nhead_);
+#if CK_TILE_FMHA_FWD_FAST_EXP2
+                    slope *= ck_tile::log2e_v<>;
+#endif
+                    if constexpr(kHasMask)
+                    {
+                        return make_alibi_from_lr_mask<SaccDataType, true, 32>(
+                            slope,
+                            kargs.window_size_left,
+                            kargs.window_size_right,
+                            kargs.seqlen_q,
+                            kargs.seqlen_k,
+                            kargs.mask_type);
+                    }
+                    else
+                    {
+                        return Alibi<SaccDataType, true, 32>{
+                            slope, kargs.seqlen_q, kargs.seqlen_k, AlibiMode::FROM_BOTTOM_RIGHT};
+                    }
+                }
+                else
+                {
+                    return EmptyPositionEncoding<SaccDataType>{};
+                }
+            }();
+
+            auto o_acc_tile = [&]() {
+                if constexpr(PrefillCase)
+                {
+                    // allocate double LDS buffers (two each for K and V)
+                    // add __restrict__ here to avoid aliasing
+                    __shared__ char smem_ptrk0
+                        [FmhaPipeline::Policy::template GetSmemSizeK<typename FmhaPipeline::Problem,
+                                                                     true>()];
+                    __shared__ char smem_ptrk1
+                        [FmhaPipeline::Policy::template GetSmemSizeK<typename FmhaPipeline::Problem,
+                                                                     true>()];
+                    __shared__ char smem_ptrv0[FmhaPipeline::Policy::template GetSmemSizeV<
+                        typename FmhaPipeline::Problem>()];
+                    __shared__ char smem_ptrv1[FmhaPipeline::Policy::template GetSmemSizeV<
+                        typename FmhaPipeline::Problem>()];
+
+                    return FmhaPipeline{}(q_dram_window,
+                                          k_dram_window,
+                                          v_dram_window,
+                                          bias_dram_window,
+                                          lse_dram_window,
+                                          mask,
+                                          position_encoding,
+                                          kargs.scale_s,
+                                          smem_ptrk0,
+                                          smem_ptrk1,
+                                          smem_ptrv0,
+                                          smem_ptrv1);
+                }
+                else
+                {
+                    __shared__ char smem_ptr[GetSmemSize()];
+                    return FmhaPipeline{}(q_dram_window,
+                                          k_dram_window,
+                                          v_dram_window,
+                                          bias_dram_window,
+                                          lse_dram_window,
+                                          mask,
+                                          position_encoding,
+                                          kargs.scale_s,
+                                          smem_ptr);
+                }
+            }();
+
+            // Oacc DRAM and Oacc DRAM window
+            auto o_dram = [&] {
+                const auto o_dram_naive = [&] {
+                    {
+                        return make_naive_tensor_view<address_space_enum::global>(
+                            o_ptr,
+                            make_tuple(kargs.seqlen_q, kargs.hdim_v),
+                            make_tuple(kargs.stride_o, 1),
+                            number<FmhaPipeline::kAlignmentOacc>{},
+                            number<1>{});
+                    }
+                }();
+
+                return pad_tensor_view(
+                    o_dram_naive,
+                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
+                    sequence<kPadSeqLenQ, kPadHeadDimV>{});
+            }();
+
+            auto o_dram_window = make_tile_window(
+                o_dram,
                 make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
-                sequence<kPadSeqLenQ, kPadHeadDimV>{});
-        }();
+                {i_m0, i_n1});
 
-        auto o_dram_window =
-            make_tile_window(o_dram,
-                             make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
-                             {i_m0, i_n1});
-
-        EpiloguePipeline{}(o_dram_window, o_acc_tile);
+            EpiloguePipeline{}(o_dram_window, o_acc_tile);
+        }
     }
 };
 
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp
index d8cd006c60..9a3e8ac304 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp
@@ -30,6 +30,7 @@ struct FmhaFwdPagedKVKernel
     using EpiloguePipeline                        = ck_tile::remove_cvref_t<EpiloguePipeline_>;
     static constexpr ck_tile::index_t kBlockSize  = FmhaPipeline::kBlockSize;
     static constexpr ck_tile::index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
+
     static_assert(kBlockPerCu > 0);
     static constexpr ck_tile::index_t kBlockPerCuInput = FmhaPipeline::Problem::kBlockPerCu;
 
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
index 99ee912db9..ee1236d465 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -14,6 +14,7 @@ struct FmhaFwdSplitKVCombineKernel
     static constexpr index_t kNumWarps   = FmhaPipeline::kNumWarps;
     static constexpr index_t kBlockSize  = FmhaPipeline::kBlockSize;
     static constexpr index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
+
     static_assert(kBlockPerCu > 0);
     static constexpr index_t kBlockPerCuInput = FmhaPipeline::Problem::kBlockPerCu;
 
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
index 501aa26667..c50537f3fe 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -26,6 +26,7 @@ struct FmhaFwdSplitKVKernel
     using EpiloguePipeline                        = ck_tile::remove_cvref_t<EpiloguePipeline_>;
     static constexpr ck_tile::index_t kBlockSize  = FmhaPipeline::kBlockSize;
     static constexpr ck_tile::index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
+
     static_assert(kBlockPerCu > 0);
     static constexpr ck_tile::index_t kBlockPerCuInput = FmhaPipeline::Problem::kBlockPerCu;
 
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_kr_ktr_vr.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_kr_ktr_vr.hpp
index 1d95bc2801..9a31498dd1 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_kr_ktr_vr.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_kr_ktr_vr.hpp
@@ -347,22 +347,19 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
         const auto bias_origin = bias_dram_block_window_tmp.get_window_origin();
 
         auto bias_dram_window =
-            make_tile_window(Policy::template TransformXDramTensorView<QDataType>(
-                                 bias_dram_block_window_tmp.get_bottom_tensor_view()),
+            make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(),
                              bias_dram_block_window_tmp.get_window_lengths(),
                              {seqlen_q_start, bias_origin.at(number<1>{})},
                              Policy::template MakeBiasTileDistribution<Problem>());
 
         auto bias_lds = make_tensor_view<address_space_enum::lds>(
-            bias_lds_ptr, Policy::template MakeBiasLdsWriteBlockDescriptor<Problem>());
+            bias_lds_ptr, Policy::template MakeBiasLdsBlockDescriptor<Problem>());
         auto bias_lds_write_window =
             make_tile_window(bias_lds, make_tuple(number<kM0>{}, number<kN0>{}), {0, 0});
 
-        auto bias_lds_read = make_tensor_view<address_space_enum::lds>(
-            bias_lds_ptr, Policy::template MakeBiasLdsReadBlockDescriptor<Problem>());
         auto bias_s_lds_read_window =
-            make_tile_window(bias_lds_read,
-                             make_tuple(number<kM0>{}, number<kN0>{}),
+            make_tile_window(bias_lds_write_window.get_bottom_tensor_view(),
+                             bias_lds_write_window.get_window_lengths(),
                              bias_lds_write_window.get_window_origin(),
                              Policy::template MakeBiasSTileDistribution<decltype(gemm_0)>());
 
@@ -500,8 +497,11 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
                 // STAGE 2, Scale, Add bias, Mask, Softmax, Dropout
                 if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                 {
-                    async_load_tile(bias_lds_write_window, bias_dram_window);
-                    __builtin_amdgcn_s_waitcnt(3952);
+                    const auto bias_tile    = load_tile(bias_dram_window);
+                    auto shuffled_bias_tile = make_static_distributed_tensor<BiasDataType>(
+                        Policy::template MakeShuffledBiasTileDistribution<Problem>());
+                    shuffle_tile(shuffled_bias_tile, bias_tile);
+                    store_tile(bias_lds_write_window, shuffled_bias_tile);
                     block_sync_lds();
                     auto bias_s_tile = load_tile(bias_s_lds_read_window);
                     tile_elementwise_inout(
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp
index 65f70c4f62..3112070271 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp
@@ -323,22 +323,19 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
         const auto bias_origin = bias_dram_block_window_tmp.get_window_origin();
 
         auto bias_dram_window =
-            make_tile_window(Policy::template TransformXDramTensorView<QDataType>(
-                                 bias_dram_block_window_tmp.get_bottom_tensor_view()),
+            make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(),
                              bias_dram_block_window_tmp.get_window_lengths(),
                              {bias_origin.at(number<0>{}), seqlen_kv_start},
                              Policy::template MakeBiasTileDistribution<Problem>());
 
         auto bias_lds = make_tensor_view<address_space_enum::lds>(
-            bias_lds_ptr, Policy::template MakeBiasLdsWriteBlockDescriptor<Problem>());
+            bias_lds_ptr, Policy::template MakeBiasLdsBlockDescriptor<Problem>());
         auto bias_lds_write_window =
             make_tile_window(bias_lds, make_tuple(number<kM0>{}, number<kN0>{}), {0, 0});
 
-        auto bias_lds_read = make_tensor_view<address_space_enum::lds>(
-            bias_lds_ptr, Policy::template MakeBiasLdsReadBlockDescriptor<Problem>());
         auto bias_s_lds_read_window =
-            make_tile_window(bias_lds_read,
-                             make_tuple(number<kM0>{}, number<kN0>{}),
+            make_tile_window(bias_lds_write_window.get_bottom_tensor_view(),
+                             bias_lds_write_window.get_window_lengths(),
                              bias_lds_write_window.get_window_origin(),
                              Policy::template MakeBiasSTileDistribution<decltype(gemm_0)>());
 
@@ -490,8 +487,11 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
                 // STAGE 2, Scale, Add bias, Mask, Softmax, Dropout
                 if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                 {
-                    async_load_tile(bias_lds_write_window, bias_dram_window);
-                    __builtin_amdgcn_s_waitcnt(3952);
+                    const auto bias_tile    = load_tile(bias_dram_window);
+                    auto shuffled_bias_tile = make_static_distributed_tensor<BiasDataType>(
+                        Policy::template MakeShuffledBiasTileDistribution<Problem>());
+                    shuffle_tile(shuffled_bias_tile, bias_tile);
+                    store_tile(bias_lds_write_window, shuffled_bias_tile);
                     block_sync_lds();
                     auto bias_s_tile = load_tile(bias_s_lds_read_window);
                     tile_elementwise_inout(
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp
index aa2ec99590..68ead7c765 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp
@@ -43,7 +43,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy
                                            typename Problem::BlockFmhaShape::Gemm0BlockWarps,
                                            typename Problem::BlockFmhaShape::Gemm0WarpTile>>;
 
-        using WarpGemm = WarpGemmMfmaDispatcher<
+        using WarpGemm = WarpGemmDispatcher<
             typename Problem::QDataType,
             typename Problem::KDataType,
             typename Problem::AccDataType,
@@ -78,18 +78,18 @@ struct BlockFmhaBwdPipelineDefaultPolicy
                                            typename Problem::BlockFmhaShape::Gemm1WarpTile>>;
 
         using WarpGemm =
-            WarpGemmMfmaDispatcher<typename Problem::GemmDataType,
-                                   typename Problem::OGradDataType,
-                                   typename Problem::AccDataType,
-                                   Problem::BlockFmhaShape::Gemm1WarpTile::at(number<0>{}),
-                                   Problem::BlockFmhaShape::Gemm1WarpTile::at(number<1>{}),
-                                   Problem::BlockFmhaShape::Gemm1WarpTile::at(number<2>{}),
-                                   true,
-                                   false, // SwizzleAccess
-                                   false, // UseStructuredSparsity
-                                   (Problem::BlockFmhaShape::Gemm1WarpTile::at(number<2>{}) == 32)
-                                       ? WGAttrNumAccessEnum ::Double
-                                       : WGAttrNumAccessEnum ::Single>;
+            WarpGemmDispatcher<typename Problem::GemmDataType,
+                               typename Problem::OGradDataType,
+                               typename Problem::AccDataType,
+                               Problem::BlockFmhaShape::Gemm1WarpTile::at(number<0>{}),
+                               Problem::BlockFmhaShape::Gemm1WarpTile::at(number<1>{}),
+                               Problem::BlockFmhaShape::Gemm1WarpTile::at(number<2>{}),
+                               true,
+                               false, // SwizzleAccess
+                               false, // UseStructuredSparsity
+                               (Problem::BlockFmhaShape::Gemm1WarpTile::at(number<2>{}) == 32)
+                                   ? WGAttrNumAccessEnum ::Double
+                                   : WGAttrNumAccessEnum ::Single>;
 
         using BlockGemmPolicy =
             BlockGemmARegBRegCRegV1CustomPolicy<typename Problem::GemmDataType,
@@ -115,7 +115,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy
                                            typename Problem::BlockFmhaShape::Gemm2BlockWarps,
                                            typename Problem::BlockFmhaShape::Gemm2WarpTile>>;
 
-        using WarpGemm = WarpGemmMfmaDispatcher<
+        using WarpGemm = WarpGemmDispatcher<
             typename Problem::OGradDataType,
             typename Problem::VDataType,
             typename Problem::AccDataType,
@@ -150,18 +150,18 @@ struct BlockFmhaBwdPipelineDefaultPolicy
                                            typename Problem::BlockFmhaShape::Gemm3WarpTile>>;
 
         using WarpGemm =
-            WarpGemmMfmaDispatcher<typename Problem::GemmDataType,
-                                   typename Problem::QDataType,
-                                   typename Problem::AccDataType,
-                                   Problem::BlockFmhaShape::Gemm3WarpTile::at(number<0>{}),
-                                   Problem::BlockFmhaShape::Gemm3WarpTile::at(number<1>{}),
-                                   Problem::BlockFmhaShape::Gemm3WarpTile::at(number<2>{}),
-                                   true,
-                                   false, // SwizzleAccess
-                                   false, // UseStructuredSparsity
-                                   (Problem::BlockFmhaShape::Gemm1WarpTile::at(number<2>{}) == 32)
-                                       ? WGAttrNumAccessEnum ::Double
-                                       : WGAttrNumAccessEnum ::Single>;
+            WarpGemmDispatcher<typename Problem::GemmDataType,
+                               typename Problem::QDataType,
+                               typename Problem::AccDataType,
+                               Problem::BlockFmhaShape::Gemm3WarpTile::at(number<0>{}),
+                               Problem::BlockFmhaShape::Gemm3WarpTile::at(number<1>{}),
+                               Problem::BlockFmhaShape::Gemm3WarpTile::at(number<2>{}),
+                               true,
+                               false, // SwizzleAccess
+                               false, // UseStructuredSparsity
+                               (Problem::BlockFmhaShape::Gemm1WarpTile::at(number<2>{}) == 32)
+                                   ? WGAttrNumAccessEnum ::Double
+                                   : WGAttrNumAccessEnum ::Single>;
 
         using BlockGemmPolicy =
             BlockGemmARegBRegCRegV1CustomPolicy<typename Problem::GemmDataType,
@@ -187,14 +187,13 @@ struct BlockFmhaBwdPipelineDefaultPolicy
                                            typename Problem::BlockFmhaShape::Gemm4BlockWarps,
                                            typename Problem::BlockFmhaShape::Gemm4WarpTile>>;
 
-        using WarpGemm =
-            WarpGemmMfmaDispatcher<typename Problem::GemmDataType,
-                                   typename Problem::KDataType,
-                                   typename Problem::AccDataType,
-                                   Problem::BlockFmhaShape::Gemm4WarpTile::at(number<0>{}),
-                                   Problem::BlockFmhaShape::Gemm4WarpTile::at(number<1>{}),
-                                   Problem::BlockFmhaShape::Gemm4WarpTile::at(number<2>{}),
-                                   false>;
+        using WarpGemm = WarpGemmDispatcher<typename Problem::GemmDataType,
+                                            typename Problem::KDataType,
+                                            typename Problem::AccDataType,
+                                            Problem::BlockFmhaShape::Gemm4WarpTile::at(number<0>{}),
+                                            Problem::BlockFmhaShape::Gemm4WarpTile::at(number<1>{}),
+                                            Problem::BlockFmhaShape::Gemm4WarpTile::at(number<2>{}),
+                                            false>;
 
         using BlockGemmPolicy =
             BlockGemmARegBRegCRegV1CustomPolicy<typename Problem::GemmDataType,
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_trload_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_trload_default_policy.hpp
index d1fb1669c9..6259e5b473 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_trload_default_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_trload_default_policy.hpp
@@ -25,7 +25,7 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
                                            typename Problem::BlockFmhaShape::Gemm0WarpTile>>;
 
         constexpr auto SwizzleA = false;
-        using WarpGemm          = WarpGemmMfmaDispatcher< //
+        using WarpGemm          = WarpGemmDispatcher< //
             typename Problem::QDataType,
             typename Problem::KDataType,
             typename Problem::AccDataType,
@@ -66,7 +66,7 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
                                            typename Problem::BlockFmhaShape::Gemm2WarpTile>>;
 
         constexpr auto SwizzleA = false;
-        using WarpGemm          = WarpGemmMfmaDispatcher< //
+        using WarpGemm          = WarpGemmDispatcher< //
             typename Problem::OGradDataType,
             typename Problem::VDataType,
             typename Problem::AccDataType,
@@ -106,7 +106,7 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
                    typename BlockFmhaShape::Gemm4BlockWarps,
                    typename BlockFmhaShape::Gemm4WarpTile>>;
 
-        using WarpGemm = WarpGemmMfmaDispatcher< //
+        using WarpGemm = WarpGemmDispatcher< //
             typename Problem::GemmDataType,
             typename Problem::KDataType,
             typename Problem::AccDataType,
@@ -551,11 +551,9 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
                                             Problem::BlockFmhaShape::kQKHeaddim>();
     }
     template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeBiasLdsWriteBlockDescriptor()
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBiasLdsBlockDescriptor()
     {
-        return MakeXLdsWriteBlockDescriptor<typename Problem::BiasDataType,
-                                            Problem::BlockFmhaShape::kM0,
-                                            Problem::BlockFmhaShape::kN0>();
+        return BlockFmhaBwdPipelineDefaultPolicy::MakeBiasLdsBlockDescriptor<Problem>();
     }
 
     template <typename Problem, bool Transposed = false>
@@ -684,13 +682,6 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
                                            Problem::BlockFmhaShape::kM0,
                                            Problem::BlockFmhaShape::kQKHeaddim>();
     }
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeBiasLdsReadBlockDescriptor()
-    {
-        return MakeXLdsReadBlockDescriptor<typename Problem::BiasDataType,
-                                           Problem::BlockFmhaShape::kM0,
-                                           Problem::BlockFmhaShape::kN0>();
-    }
 
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeQRegSliceBlockDescriptor()
@@ -966,25 +957,7 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledBiasTileDistribution()
     {
-        constexpr index_t kBlockSize = Problem::kBlockSize;
-
-        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
-        constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN0;
-
-        constexpr index_t N1 = min(static_cast<index_t>(GetAlignmentBias<Problem>()),
-                                   kMPerBlock * kNPerBlock / kBlockSize);
-        constexpr index_t N0 = kNPerBlock / N1;
-        constexpr index_t M0 = kBlockSize / get_warp_size();
-        constexpr index_t M1 = get_warp_size() / N0;
-        constexpr index_t M2 = kMPerBlock / M1 / M0;
-
-        return make_static_tile_distribution(
-            tile_distribution_encoding<sequence<>,
-                                       tuple<sequence<M0, M1, M2>, sequence<N0, N1>>,
-                                       tuple<sequence<1>, sequence<1, 2>>,
-                                       tuple<sequence<0>, sequence<1, 0>>,
-                                       sequence<2, 1>,
-                                       sequence<1, 2>>{});
+        return BlockFmhaBwdPipelineDefaultPolicy::MakeShuffledBiasTileDistribution<Problem>();
     }
 
     template <typename BlockGemm>
@@ -1048,7 +1021,7 @@ struct BlockFmhaBwdPipelineTrLoadDefaultPolicy
     {
         if constexpr(Problem::BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
             return sizeof(typename Problem::BiasDataType) *
-                   MakeBiasLdsWriteBlockDescriptor<Problem>().get_element_space_size();
+                   MakeBiasLdsBlockDescriptor<Problem>().get_element_space_size();
         else
             return 0;
     }
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_enum.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_enum.hpp
index cf70dff63f..45a1c8f4b8 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_enum.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_enum.hpp
@@ -11,6 +11,7 @@ enum class BlockFmhaPipelineEnum
     QRKSVS = 0,
     QRKSVS_ASYNC,
     QSKSVS,
+    QRKSVS_ASYNC_TRLOAD,
 };
 
 template <BlockFmhaPipelineEnum>
@@ -32,4 +33,10 @@ struct BlockFmhaPipelineEnumToStr<BlockFmhaPipelineEnum::QSKSVS>
     static constexpr const char* name = "qs";
 };
 
+template <>
+struct BlockFmhaPipelineEnumToStr<BlockFmhaPipelineEnum::QRKSVS_ASYNC_TRLOAD>
+{
+    static constexpr const char* name = "qr_async_trload"; // string tag for QRKSVS_ASYNC_TRLOAD
+};
+
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
index 20b30b7417..86ac713b6f 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
@@ -22,6 +22,7 @@ template <typename QDataType_,
           bool kIsGroupMode_,
           typename AttentionVariant_,
           typename FmhaMask_,
+          bool kUseTrLoad_,
           typename Traits_>
 struct BlockFmhaPipelineProblem
 {
@@ -46,6 +47,7 @@ struct BlockFmhaPipelineProblem
     static constexpr index_t kBlockSize     = BlockFmhaShape::NumWarps * get_warp_size();
 
     static constexpr bool kIsGroupMode = kIsGroupMode_;
+    static constexpr bool kUseTrLoad   = kUseTrLoad_;
 
     // attributes from traits
     static constexpr bool kPadSeqLenQ       = Traits::kPadSeqLenQ;
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload.hpp
new file mode 100644
index 0000000000..39d8814692
--- /dev/null
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload.hpp
@@ -0,0 +1,1177 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload_policy.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce.hpp"
+
+namespace ck_tile {
+
+// In this pipeline the Q, K and V tiles are all staged in LDS
+template <typename Problem_, typename Policy_ = BlockFmhaPipelineQRKSVSAsyncTrloadDefaultPolicy>
+struct BlockFmhaPipelineQRKSVSAsyncTrload
+{
+    static constexpr auto I0 = number<0>{};
+    static constexpr auto I1 = number<1>{};
+
+    using Problem               = remove_cvref_t<Problem_>;
+    using Policy                = remove_cvref_t<Policy_>;
+    using QDataType             = remove_cvref_t<typename Problem::QDataType>;
+    using KDataType             = remove_cvref_t<typename Problem::KDataType>;
+    using VDataType             = remove_cvref_t<typename Problem::VDataType>;
+    using SaccDataType          = remove_cvref_t<typename Problem::SaccDataType>;
+    using SMPLComputeDataType   = remove_cvref_t<typename Problem::SMPLComputeDataType>;
+    using BiasDataType          = remove_cvref_t<typename Problem::BiasDataType>;
+    using RandValOutputDataType = remove_cvref_t<typename Problem::RandValOutputDataType>;
+    using LSEDataType           = remove_cvref_t<typename Problem::LSEDataType>;
+    using PDataType             = remove_cvref_t<typename Problem::PDataType>;
+    using OaccDataType          = remove_cvref_t<typename Problem::OaccDataType>;
+    using ODataType             = remove_cvref_t<typename Problem::ODataType>;
+    using AttentionVariant      = remove_cvref_t<typename Problem::AttentionVariant>;
+    using FmhaMask              = remove_cvref_t<typename Problem::FmhaMask>;
+
+    using BlockFmhaShape             = remove_cvref_t<typename Problem::BlockFmhaShape>;
+    using VLayout                    = remove_cvref_t<typename BlockFmhaShape::VLayout>;
+    static constexpr bool kQLoadOnce = true; // if q_tile load whole block length (hdim) at once
+    static_assert(kQLoadOnce == Policy::QLoadOnce);
+
+    static constexpr index_t kBlockSize = Problem::kBlockSize;
+
+    static constexpr index_t kM0           = BlockFmhaShape::kM0;
+    static constexpr index_t kN0           = BlockFmhaShape::kN0;
+    static constexpr index_t kK0           = BlockFmhaShape::kK0;
+    static constexpr index_t kN1           = BlockFmhaShape::kN1;
+    static constexpr index_t kK1           = BlockFmhaShape::kK1;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
+    static constexpr index_t kNWarp        = BlockFmhaShape::Gemm0BlockWarps::at(I1);
+    static constexpr index_t kNXdl         = BlockFmhaShape::Gemm0WarpTile::at(I1);
+
+    static_assert(kSubQKHeaddim <= 256, "hdim bigger than 256 is not suitable for this pipeline!");
+
+    // static_assert(Problem::kPadSeqLenQ == true && Problem::kPadHeadDimQ == true &&
+    //               Problem::kPadHeadDimV == true);
+
+    static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
+    static constexpr bool kPadSeqLenQ  = Problem::kPadSeqLenQ;
+    static constexpr bool kPadSeqLenK  = Problem::kPadSeqLenK;
+    static constexpr bool kPadHeadDimQ =
+        Problem::kPadHeadDimQ; // support multiple of vector(like 8x)
+    static constexpr bool kPadHeadDimV =
+        Problem::kPadHeadDimV; // support multiple of vector(like 8x)
+
+    static constexpr bool kHasLogitsSoftCap = Problem::kHasLogitsSoftCap;
+    static constexpr bool kHasDropout       = Problem::kHasDropout;
+    static constexpr auto BiasEnum          = Problem::BiasEnum;
+    static constexpr bool kStoreLSE         = Problem::kStoreLSE;
+    static constexpr bool kHasUnevenSplits  = true;
+
+    static_assert((CK_TILE_FMHA_FWD_FAST_EXP2 &&
+                   (kHasLogitsSoftCap && Problem::BiasEnum == BlockAttentionBiasEnum::NO_BIAS ||
+                    !kHasLogitsSoftCap)) ||
+                  (!CK_TILE_FMHA_FWD_FAST_EXP2 && !kHasLogitsSoftCap));
+
+    // last-dimension vector length used to create the tensor view (and to decide the buffer_load
+    // vector length) together with the tensor distribution, which should be able to override this
+    static constexpr index_t kAlignmentQ = Policy::template GetAlignmentQ<Problem>();
+    static constexpr index_t kAlignmentK = Policy::template GetAlignmentK<Problem>();
+    static constexpr index_t kAlignmentV = []() {
+        if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+            return Policy::template GetAlignmentV<Problem>();
+        else // V is not row-major: drop to alignment 1 when kPadSeqLenK is set
+            return kPadSeqLenK ? 1 : Policy::template GetAlignmentV<Problem>();
+    }();
+
+    static constexpr index_t kAlignmentOacc = Policy::template GetAlignmentO<Problem>();
+
+    static constexpr index_t kAlignmentBias =
+        kPadSeqLenK ? 1 : Policy::template GetAlignmentBias<Problem>();
+
+    static constexpr index_t kBlockPerCu = []() { // Problem::kBlockPerCu == -1 selects defaults
+        if constexpr(Problem::kBlockPerCu != -1)
+            return Problem::kBlockPerCu;
+        else
+        {
+            if constexpr(kQKHeaddim <= 32)
+            {
+                return 2;
+            }
+            else if constexpr(kQKHeaddim <= 64)
+            {
+                return 3;
+            }
+            else if constexpr(kQKHeaddim <= 128)
+            {
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS || kM0 >= 256)
+                    return 1;
+                else
+                    return 2;
+            }
+            else if constexpr(kQKHeaddim <= 256)
+            {
+                return 1;
+            }
+            else
+            {
+                return 1; // same result as the kQKHeaddim <= 256 branch
+            }
+        }
+    }();
+
+    static constexpr const char* name = "qr_async_trload";
+
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>(); // size is delegated entirely to Policy
+    }
+
+    // Decode: one smem_ptr; Q and K views alias its start, S follows K, V follows S
+    template <typename QDramBlockWindowTmp,
+              typename KDramBlockWindowTmp,
+              typename VDramBlockWindowTmp,
+              typename BiasDramBlockWindowTmp,
+              typename LSEaccDramBlockWindowTmp,
+              typename PositionEncoding>
+    CK_TILE_HOST_DEVICE auto
+    operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp,       // M0*K0 tile
+               const KDramBlockWindowTmp& k_dram_block_window_tmp,       // N0*K0 tile
+               const VDramBlockWindowTmp& v_dram_block_window_tmp,       // N1*K1 tile
+               const BiasDramBlockWindowTmp& bias_dram_block_window_tmp, // M0*N0 tile
+               LSEaccDramBlockWindowTmp& lse_acc_dram_window_tmp,        // M0*1 tile
+               FmhaMask mask,
+               PositionEncoding position_encoding,
+               float scale_s,
+               void* smem_ptr) const
+    {
+        static_assert(
+            std::is_same_v<QDataType, remove_cvref_t<typename QDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<KDataType, remove_cvref_t<typename KDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<VDataType, remove_cvref_t<typename VDramBlockWindowTmp::DataType>>,
+            "wrong!");
+
+        static_assert(kM0 == QDramBlockWindowTmp{}.get_window_lengths()[I0] &&
+                          kSubQKHeaddim == QDramBlockWindowTmp{}.get_window_lengths()[I1] &&
+                          kN0 == KDramBlockWindowTmp{}.get_window_lengths()[I0] &&
+                          kK0 == KDramBlockWindowTmp{}.get_window_lengths()[I1] &&
+                          kN1 == VDramBlockWindowTmp{}.get_window_lengths()[I0] &&
+                          kK1 == VDramBlockWindowTmp{}.get_window_lengths()[I1] &&
+                          kM0 == BiasDramBlockWindowTmp{}.get_window_lengths()[I0] &&
+                          kN0 == BiasDramBlockWindowTmp{}.get_window_lengths()[I1],
+                      "wrong!");
+        ignore = bias_dram_block_window_tmp; // only its window lengths are checked above
+        ignore = position_encoding;
+        // Block GEMM
+        constexpr auto gemm_0 = Policy::template GetQKBlockGemm<Problem>();
+        constexpr auto gemm_1 = Policy::template GetPVBlockGemm<Problem>();
+
+        using SaccBlockTileType = decltype(gemm_0.MakeCBlockTile());
+        auto s_acc              = SaccBlockTileType{};
+
+        // reduction function for softmax
+        const auto f_max = [](auto e0, auto e1) { return max(e0, e1); };
+        const auto f_sum = [](auto e0, auto e1) { return e0 + e1; };
+
+        using OaccBlockTileType = decltype(gemm_1.MakeCBlockTile());
+
+        auto o_acc = OaccBlockTileType{};
+
+        // infer Sacc, S, P, M, L, Oacc type
+        using SBlockTileType = decltype(cast_tile<SMPLComputeDataType>(o_acc));
+
+        using MLBlockTileType = decltype(block_tile_reduce<SMPLComputeDataType>(
+            SBlockTileType{}, sequence<1>{}, f_max, SMPLComputeDataType{0}));
+
+        // init M, L
+        auto m = MLBlockTileType{};
+        auto l = MLBlockTileType{};
+
+        clear_tile(o_acc);
+        set_tile(m, -numeric<SMPLComputeDataType>::infinity());
+        clear_tile(l);
+
+        const auto q_origin = q_dram_block_window_tmp.get_window_origin();
+        const auto [logical_seqlen_k_start, logical_seqlen_k_end] =
+            mask.GetTileRangeAlongX(q_origin.at(I0), number<kM0>{}, number<kN0>{});
+
+        // check early exit if no work to do
+        if constexpr(FmhaMask::IsMasking || kPadSeqLenK || kHasUnevenSplits)
+        {
+            const index_t logical_num_total_loop =
+                integer_divide_ceil(logical_seqlen_k_end - logical_seqlen_k_start, kN0);
+            if(logical_num_total_loop <= 0)
+            {
+                if constexpr(kStoreLSE)
+                {
+                    auto lse_acc =
+                        make_static_distributed_tensor<LSEDataType>(m.get_tile_distribution());
+
+                    set_tile(lse_acc, -numeric<SMPLComputeDataType>::infinity());
+
+                    if(get_thread_local_1d_id() < kM0)
+                    {
+                        store_tile(lse_acc_dram_window_tmp, lse_acc);
+                    }
+                }
+
+                // Note: o_acc was cleared above and is returned untouched
+                // Note: no tile load has been issued yet at this point.
+                return o_acc;
+            }
+        }
+
+        // Q tile in LDS
+        auto q_dram_window = make_tile_window(
+            q_dram_block_window_tmp, Policy::template MakeQDramTileDistribution<Problem>());
+
+        auto q_lds_write_view = make_tensor_view<address_space_enum::lds>(
+            static_cast<QDataType*>(smem_ptr), Policy::template MakeQLdsBlockDescriptor<Problem>());
+
+        auto q_lds_read_view = make_tensor_view<address_space_enum::lds>(
+            static_cast<QDataType*>(smem_ptr),
+            Policy::template MakeQLdsBlockDescriptor<Problem, true>());
+
+        auto q_lds_store_window =
+            make_tile_window(q_lds_write_view,
+                             Policy::template MakeQLdsBlockDescriptor<Problem>().get_lengths(),
+                             {0, 0});
+
+        auto q_lds_read_window =
+            make_tile_window(q_lds_read_view,
+                             Policy::template MakeQLdsBlockDescriptor<Problem>().get_lengths(),
+                             {0, 0},
+                             Policy::template MakeQRegTileDistribution<Problem>());
+
+        async_load_tile(q_lds_store_window, q_dram_window);
+
+        // K tile in LDS
+        const index_t physical_seqlen_k_start = logical_seqlen_k_start;
+        const index_t physical_seqlen_k_end   = logical_seqlen_k_end;
+        // make sure the first tile is completely located in page-block (page-block size should be
+        // divisible by kN0)
+        // relationship between each *_start variables: aligned_physical_seqlen_k_start <=
+        // physical_seqlen_k_start, logical_seqlen_k_start <= physical_seqlen_k_start
+        const index_t aligned_physical_seqlen_k_start = physical_seqlen_k_start;
+
+        auto k_dram_window = make_tile_window(
+            k_dram_block_window_tmp, Policy::template MakeKDramTileDistribution<Problem>());
+
+        auto k_lds_write_view = make_tensor_view<address_space_enum::lds>(
+            static_cast<KDataType*>(smem_ptr), Policy::template MakeKLdsBlockDescriptor<Problem>());
+        auto k_lds_read_view = make_tensor_view<address_space_enum::lds>(
+            static_cast<KDataType*>(smem_ptr),
+            Policy::template MakeKLdsBlockDescriptor<Problem, false, true>());
+
+        auto k_lds_write_window =
+            make_tile_window(k_lds_write_view,
+                             Policy::template MakeKLdsBlockDescriptor<Problem>().get_lengths(),
+                             {0, 0});
+        auto k_lds_read_window =
+            make_tile_window(k_lds_read_view,
+                             make_tuple(number<kN0>{}, number<kK0>{}),
+                             {0, 0},
+                             Policy::template MakeKRegTileDistribution<Problem>());
+
+        // S tile in LDS
+        auto s_lds = make_tensor_view<address_space_enum::lds>(
+            reinterpret_cast<SaccDataType*>(reinterpret_cast<char*>(smem_ptr) +
+                                            Policy::template GetSmemSizeK<Problem>()),
+            Policy::template MakeSLdsBlockDescriptor<Problem>());
+        auto s_write_lds_window = make_tile_window(
+            s_lds, Policy::template MakeSLdsBlockDescriptor<Problem>().get_lengths(), {0, 0});
+        auto s_read_lds_window =
+            make_tile_window(s_lds,
+                             Policy::template MakeSLdsBlockDescriptor<Problem>().get_lengths(),
+                             {0, 0},
+                             Policy::template MakeSRegTileDistribution<Problem>());
+
+        // V tile in LDS
+        auto v_dram_window = make_tile_window(
+            v_dram_block_window_tmp, Policy::template MakeVDramTileDistribution<Problem>());
+
+        auto v_lds_write_view = make_tensor_view<address_space_enum::lds>(
+            reinterpret_cast<VDataType*>(static_cast<char*>(smem_ptr) +
+                                         Policy::template GetSmemSizeK<Problem>() +
+                                         Policy::template GetSmemSizeS<Problem>()),
+            Policy::template MakeVLdsBlockDescriptor<Problem>());
+        auto v_lds_read_view = make_tensor_view<address_space_enum::lds>(
+            reinterpret_cast<VDataType*>(static_cast<char*>(smem_ptr) +
+                                         Policy::template GetSmemSizeK<Problem>() +
+                                         Policy::template GetSmemSizeS<Problem>()),
+            Policy::template MakeVLdsBlockDescriptor<Problem, true>());
+        auto v_lds_write_window =
+            make_tile_window(v_lds_write_view,
+                             Policy::template MakeVLdsBlockDescriptor<Problem>().get_lengths(),
+                             {0, 0});
+
+        auto v_lds_read_window =
+            make_tile_window(v_lds_read_view,
+                             make_tuple(number<kK1>{}, number<kN1>{}),
+                             {0, 0},
+                             Policy::template MakeVRegTileDistribution<Problem>());
+
+        block_sync_lds_direct_load<0>();
+        auto q_tile = load_tile(q_lds_read_window);
+
+        const index_t num_total_loop =
+            integer_divide_ceil(physical_seqlen_k_end - aligned_physical_seqlen_k_start, kN0);
+
+        index_t i_total_loops      = 0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
+        constexpr index_t k1_loops = kN0 / kK1;
+
+        static_assert(1 <= k0_loops);
+        static_assert(1 <= k1_loops);
+
+        block_sync_lds();
+        async_load_tile(k_lds_write_window, k_dram_window);
+
+        constexpr index_t k_vmem_insts = k_dram_window.get_num_of_access();
+        constexpr index_t v_vmem_insts = v_dram_window.get_num_of_access();
+
+        do
+        {
+            block_sync_lds();
+            async_load_tile(v_lds_write_window, v_dram_window); // prefetch load v tile
+
+            // move V tile windows
+            move_tile_window(v_dram_window, {kN0, 0});
+
+            // STAGE 1, QK gemm
+            clear_tile(s_acc); // initialize C
+
+            if constexpr(1 < k0_loops)
+            {
+                static_for<0, k0_loops - 1, 1>{}([&](auto i_k0) {
+                    if constexpr(i_k0 == 0)
+                    {
+                        block_sync_lds_direct_load<v_vmem_insts>();
+                    }
+                    else
+                    {
+                        block_sync_lds_direct_load<0>();
+                    }
+
+                    auto k_tile = load_tile(k_lds_read_window);
+
+                    gemm_0(s_acc,
+                           get_slice_tile(q_tile,
+                                          sequence<0, i_k0 * kK0>{},
+                                          sequence<kM0, (i_k0 + 1) * kK0>{}),
+                           k_tile);
+
+                    // step along the [K]ey head dimension
+                    move_tile_window(k_dram_window, {0, kK0});
+                    block_sync_lds();
+                    async_load_tile(k_lds_write_window, k_dram_window);
+                });
+                // move back to the origin
+                move_tile_window(k_dram_window, {0, -kK0 * (k0_loops - 1)});
+            }
+
+            if constexpr(k0_loops == 1)
+            {
+                block_sync_lds_direct_load<v_vmem_insts>();
+            }
+            else
+            {
+                block_sync_lds_direct_load<0>();
+            }
+
+            auto k_tile = load_tile(k_lds_read_window);
+
+            gemm_0(s_acc,
+                   get_slice_tile(q_tile,
+                                  sequence<0, (k0_loops - 1) * kK0>{},
+                                  sequence<kM0, k0_loops * kK0>{}),
+                   k_tile);
+
+            if constexpr(kHasUnevenSplits)
+            {
+                if(i_total_loops == (num_total_loop - 1))
+                {
+                    const auto k_origin = make_tuple(kN0 * i_total_loops, 0);
+                    set_tile_if(s_acc,
+                                -numeric<SMPLComputeDataType>::infinity(),
+                                [&,
+                                 physical_seqlen_k_start_ = physical_seqlen_k_start,
+                                 physical_seqlen_k_end_   = physical_seqlen_k_end](auto tile_idx) {
+                                    const auto col = k_origin.at(I0) + tile_idx.at(I1);
+
+                                    {
+                                        return physical_seqlen_k_end_ <= col;
+                                    }
+                                });
+                }
+            }
+
+            if constexpr(kPadSeqLenK || FmhaMask::IsMasking)
+            {
+                const auto k_origin = make_tuple(kN0 * i_total_loops, 0);
+
+                bool need_perpixel_check =
+                    mask.IsEdgeTile(q_origin.at(I0), k_origin.at(I0), number<kM0>{}, number<kN0>{});
+                if(need_perpixel_check)
+                {
+                    set_tile_if(
+                        s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
+                            const auto row = q_origin.at(I0) + tile_idx.at(I0);
+                            const auto col = k_origin.at(I0) + tile_idx.at(I1);
+                            return mask.IsOutOfBound(row, col);
+                        });
+                }
+            }
+
+            // move K tile windows after current status checked
+            // prefetch next-tile along [K]ey sequence length dimension
+            move_tile_window(k_dram_window, {kN0, 0});
+
+            block_sync_lds();
+            async_load_tile(k_lds_write_window, k_dram_window);
+
+            // S for softmax (Gemm1 follows); with kNWarp > 1 S is round-tripped through LDS
+            auto s_new = [&]() {
+                if constexpr(kNWarp > 1)
+                {
+                    auto s = cast_tile<SMPLComputeDataType>(s_acc); // S{j}
+
+                    store_tile(s_write_lds_window, s);
+                    block_sync_lds();
+                    return load_tile(s_read_lds_window);
+                }
+                else
+                {
+                    return cast_tile<SMPLComputeDataType>(s_acc); // S{j}
+                }
+            }();
+
+            auto m_local = block_tile_reduce<SMPLComputeDataType>(
+                s_new,
+                sequence<1>{},
+                f_max,
+                -numeric<SMPLComputeDataType>::infinity()); // m_local = rowmax(S{j})
+            // Setting CrossWarp to false triggers a better strategy on gfx950, but causes a
+            // performance regression because of un-coexecutable packed math; disabled for now
+            block_tile_reduce_sync(
+                m_local, f_max, bool_constant<false>{} /*, bool_constant<false>{}*/);
+
+            const auto m_old = m; // m{j-1}
+            tile_elementwise_inout(
+                [](auto& e0, auto e1, auto e2) { e0 = max(e1, e2); }, m, m_old, m_local); // m{j}
+
+            auto p_compute = make_static_distributed_tensor<SMPLComputeDataType>(
+                s_new.get_tile_distribution()); // Pcompute{j}
+
+            static const auto get_validated_m = [](SMPLComputeDataType raw_m) {
+                /// NOTICE: bias might be materialized mask including -inf values, need
+                /// consideration
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                             FmhaMask::IsMasking)
+                {
+                    return raw_m == -numeric<SMPLComputeDataType>::infinity()
+                               ? type_convert<SMPLComputeDataType>(0.f)
+                               : raw_m;
+                }
+                else
+                {
+                    return raw_m;
+                }
+            };
+
+            constexpr auto p_spans = decltype(p_compute)::get_distributed_spans();
+            sweep_tile_span(p_spans[I0], [&](auto idx0) {
+                constexpr auto i_idx = make_tuple(idx0);
+                auto row_max         = scale_s * get_validated_m(m[i_idx]);
+                sweep_tile_span(p_spans[I1], [&](auto idx1) {
+                    constexpr auto i_j_idx = make_tuple(idx0, idx1);
+                    if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                                 BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                    {
+                        p_compute(i_j_idx) = exp2(s_new[i_j_idx] - get_validated_m(m[i_idx]));
+                    }
+                    else
+                    {
+                        if constexpr(kHasLogitsSoftCap)
+                        {
+                            p_compute(i_j_idx) = exp2(s_new[i_j_idx] - get_validated_m(m[i_idx]));
+                        }
+                        else
+                        {
+                            p_compute(i_j_idx) = exp2(scale_s * s_new[i_j_idx] - row_max);
+                        }
+                    }
+                });
+            });
+
+            auto rowsum_p = block_tile_reduce<SMPLComputeDataType>(
+                p_compute, sequence<1>{}, f_sum, SMPLComputeDataType{0}); // rowsum(Pcompute{j})
+
+            block_tile_reduce_sync(
+                rowsum_p, f_sum, bool_constant<false>{} /*, bool_constant<false>{}*/);
+
+            auto p_tile = make_static_distributed_tensor<PDataType>(
+                Policy::template MakePRegTileDistribution<Problem>());
+            p_tile.get_thread_buffer() = cast_tile<PDataType>(p_compute).get_thread_buffer();
+
+            // l{j}, Oacc{j}
+            constexpr auto o_spans = decltype(o_acc)::get_distributed_spans();
+            sweep_tile_span(o_spans[I0], [&](auto idx0) {
+                constexpr auto i_idx = make_tuple(idx0);
+                const auto tmp       = [&]() {
+                    if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                                 BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                    {
+                        return exp2(m_old[i_idx] - get_validated_m(m[i_idx]));
+                    }
+                    else
+                    {
+                        if constexpr(kHasLogitsSoftCap)
+                        {
+                            return exp2(m_old[i_idx] - get_validated_m(m[i_idx]));
+                        }
+                        else
+                        {
+                            auto row_max = scale_s * get_validated_m(m[i_idx]);
+                            return exp2(scale_s * m_old[i_idx] - row_max);
+                        }
+                    }
+                }();
+                l(i_idx) = tmp * l[i_idx] + rowsum_p[i_idx];
+                sweep_tile_span(o_spans[I1], [&](auto idx1) {
+                    constexpr auto i_j_idx = make_tuple(idx0, idx1);
+
+                    o_acc(i_j_idx) *= tmp;
+                });
+            });
+
+            block_sync_lds_direct_load<k_vmem_insts>();
+
+            auto v_tile = load_tile_transpose(v_lds_read_window);
+
+            if constexpr(1 < k1_loops)
+            {
+                static_for<0, k1_loops - 1, 1>{}([&](auto i_k1) {
+                    gemm_1(o_acc,
+                           get_slice_tile(p_tile,
+                                          sequence<0, i_k1 * kK1>{},
+                                          sequence<kM0, (i_k1 + 1) * kK1>{}),
+                           v_tile);
+
+                    // step along the [V]alue sequence length
+                    move_tile_window(v_lds_read_window, {kK1, 0});
+                    v_tile = load_tile_transpose(v_lds_read_window);
+                });
+                // move back to the origin
+                move_tile_window(v_lds_read_window, {-kK1 * (k1_loops - 1), 0});
+            }
+
+            gemm_1(o_acc,
+                   get_slice_tile(p_tile,
+                                  sequence<0, (k1_loops - 1) * kK1>{},
+                                  sequence<kM0, k1_loops * kK1>{}),
+                   v_tile);
+
+        } while(++i_total_loops < num_total_loop);
+
+        if constexpr(kStoreLSE)
+        {
+            // store lse acc
+            auto lse_acc = make_static_distributed_tensor<LSEDataType>(m.get_tile_distribution());
+
+            constexpr auto lse_acc_spans = decltype(lse_acc)::get_distributed_spans();
+            sweep_tile_span(lse_acc_spans[I0], [&, m_ = m, l_ = l](auto idx0) {
+                constexpr auto i_idx = make_tuple(idx0);
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                             BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                {
+                    lse_acc(i_idx) = m_[i_idx] / C_LOG2E + log(l_[i_idx]);
+                }
+                else
+                {
+                    if constexpr(kHasLogitsSoftCap)
+                    {
+                        lse_acc(i_idx) = m_[i_idx] / C_LOG2E + log(l_[i_idx]);
+                    }
+                    else
+                    {
+                        lse_acc(i_idx) = m_[i_idx] * scale_s / C_LOG2E + log(l_[i_idx]);
+                    }
+                }
+            });
+
+            if(get_thread_local_1d_id() < kM0)
+            {
+                store_tile(lse_acc_dram_window_tmp, lse_acc);
+            }
+        }
+
+        // finally, scale O by 1 / l (row sums)
+        constexpr auto o_spans = decltype(o_acc)::get_distributed_spans();
+
+        sweep_tile_span(o_spans[I0], [&](auto idx0) {
+            constexpr auto i_idx = make_tuple(idx0);
+            const auto tmp       = [&]() {
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                             FmhaMask::IsMasking)
+                {
+                    return l[i_idx] == 0.f ? 0.f : 1 / l[i_idx];
+                }
+                else
+                    return 1 / l[i_idx];
+            }();
+            sweep_tile_span(o_spans[I1], [&](auto idx1) {
+                constexpr auto i_j_idx = make_tuple(idx0, idx1);
+                o_acc(i_j_idx) *= tmp;
+            });
+        });
+
+        return o_acc;
+    }
+
+    // Prefill, double lds: K and V tiles alternate between two LDS buffers each (smem_ptrk0/1, smem_ptrv0/1)
+    template <typename QDramBlockWindowTmp,
+              typename KDramBlockWindowTmp,
+              typename VDramBlockWindowTmp,
+              typename BiasDramBlockWindowTmp,
+              typename LSEaccDramBlockWindowTmp,
+              typename PositionEncoding>
+    CK_TILE_HOST_DEVICE auto
+    operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp,       // M0*K0 tile
+               const KDramBlockWindowTmp& k_dram_block_window_tmp,       // N0*K0 tile
+               const VDramBlockWindowTmp& v_dram_block_window_tmp,       // N1*K1 tile
+               const BiasDramBlockWindowTmp& bias_dram_block_window_tmp, // M0*N0 tile
+               LSEaccDramBlockWindowTmp& lse_acc_dram_window_tmp,        // M0*1 tile
+               FmhaMask mask,
+               PositionEncoding position_encoding,
+               float scale_s,
+               void* __restrict__ smem_ptrk0,
+               void* __restrict__ smem_ptrk1,
+               void* __restrict__ smem_ptrv0,
+               void* __restrict__ smem_ptrv1) const
+    {
+        static_assert(
+            std::is_same_v<QDataType, remove_cvref_t<typename QDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<KDataType, remove_cvref_t<typename KDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<VDataType, remove_cvref_t<typename VDramBlockWindowTmp::DataType>>,
+            "wrong!");
+
+        static_assert(kM0 == QDramBlockWindowTmp{}.get_window_lengths()[I0] &&
+                          kSubQKHeaddim == QDramBlockWindowTmp{}.get_window_lengths()[I1] &&
+                          kN0 == KDramBlockWindowTmp{}.get_window_lengths()[I0] &&
+                          kK0 == KDramBlockWindowTmp{}.get_window_lengths()[I1] &&
+                          kN1 == VDramBlockWindowTmp{}.get_window_lengths()[I0] &&
+                          kK1 == VDramBlockWindowTmp{}.get_window_lengths()[I1] &&
+                          kM0 == BiasDramBlockWindowTmp{}.get_window_lengths()[I0] &&
+                          kN0 == BiasDramBlockWindowTmp{}.get_window_lengths()[I1],
+                      "wrong!");
+        ignore = bias_dram_block_window_tmp; // the bias window is not read by this overload
+        ignore = position_encoding; // the position encoding is not used by this overload
+
+        // Block GEMM
+        constexpr auto gemm_0 = Policy::template GetQKBlockGemm<Problem>();
+        constexpr auto gemm_1 = Policy::template GetPVBlockGemm<Problem>();
+
+        using SaccBlockTileType = decltype(gemm_0.MakeCBlockTile());
+        auto s_acc              = SaccBlockTileType{};
+
+        // reduction function for softmax
+        const auto f_max = [](auto e0, auto e1) { return max(e0, e1); };
+        const auto f_sum = [](auto e0, auto e1) { return e0 + e1; };
+
+        using OaccBlockTileType = decltype(gemm_1.MakeCBlockTile());
+
+        auto o_acc = OaccBlockTileType{};
+
+        // infer Sacc, S, P, M, L, Oacc type
+        using SBlockTileType = decltype(cast_tile<SMPLComputeDataType>(o_acc)); // NOTE(review): derived from o_acc rather than s_acc - confirm this is intended
+
+        using MLBlockTileType = decltype(block_tile_reduce<SMPLComputeDataType>(
+            SBlockTileType{}, sequence<1>{}, f_max, SMPLComputeDataType{0}));
+
+        // init M, L
+        auto m = MLBlockTileType{};
+        auto l = MLBlockTileType{};
+
+        clear_tile(o_acc);
+        set_tile(m, -numeric<SMPLComputeDataType>::infinity());
+        clear_tile(l);
+
+        const auto q_origin = q_dram_block_window_tmp.get_window_origin();
+        const auto [logical_seqlen_k_start, logical_seqlen_k_end] =
+            mask.GetTileRangeAlongX(q_origin.at(I0), number<kM0>{}, number<kN0>{});
+
+        // check early exit if no work to do
+        if constexpr(FmhaMask::IsMasking || kPadSeqLenK || kHasUnevenSplits)
+        {
+            const index_t logical_num_total_loop =
+                integer_divide_ceil(logical_seqlen_k_end - logical_seqlen_k_start, kN0);
+            if(logical_num_total_loop <= 0)
+            {
+                if constexpr(kStoreLSE)
+                {
+                    auto lse_acc =
+                        make_static_distributed_tensor<LSEDataType>(m.get_tile_distribution());
+
+                    set_tile(lse_acc, -numeric<SMPLComputeDataType>::infinity()); // no K tiles to process: LSE is stored as -inf
+
+                    if(get_thread_local_1d_id() < kM0)
+                    {
+                        store_tile(lse_acc_dram_window_tmp, lse_acc);
+                    }
+                }
+
+                // Note: o_acc has been cleared above, so it is returned as all zeros
+                // Note: the Q load has not been issued yet at this point, so there is nothing to wait for.
+                return o_acc;
+            }
+        }
+
+        // Q tile in LDS (staged in the smem_ptrk0 buffer, then read back into q_tile)
+        auto q_dram_window = make_tile_window(
+            q_dram_block_window_tmp, Policy::template MakeQDramTileDistribution<Problem>());
+
+        auto q_lds_write_view = make_tensor_view<address_space_enum::lds>(
+            static_cast<QDataType*>(smem_ptrk0),
+            Policy::template MakeQLdsBlockDescriptor<Problem>());
+
+        auto q_lds_read_view = make_tensor_view<address_space_enum::lds>(
+            static_cast<QDataType*>(smem_ptrk0),
+            Policy::template MakeQLdsBlockDescriptor<Problem, true>());
+
+        auto q_lds_store_window =
+            make_tile_window(q_lds_write_view,
+                             Policy::template MakeQLdsBlockDescriptor<Problem>().get_lengths(),
+                             {0, 0});
+
+        auto q_lds_read_window =
+            make_tile_window(q_lds_read_view,
+                             Policy::template MakeQLdsBlockDescriptor<Problem>().get_lengths(),
+                             {0, 0},
+                             Policy::template MakeQRegTileDistribution<Problem>());
+
+        async_load_tile(q_lds_store_window, q_dram_window);
+        block_sync_lds_direct_load<0>(); // presumably waits until the async Q load has landed in LDS - TODO confirm
+        auto q_tile = load_tile(q_lds_read_window);
+
+        // K tile in LDS
+        const index_t physical_seqlen_k_start = logical_seqlen_k_start;
+        const index_t physical_seqlen_k_end   = logical_seqlen_k_end;
+        // make sure the first tile is completely located in page-block (page-block size should be
+        // divisible by kN0)
+        // relationship between each *_start variables: aligned_physical_seqlen_k_start <=
+        // physical_seqlen_k_start, logical_seqlen_k_start <= physical_seqlen_k_start
+        const index_t aligned_physical_seqlen_k_start = physical_seqlen_k_start;
+
+        auto k_dram_window = make_tile_window(
+            k_dram_block_window_tmp, Policy::template MakeKDramTileDistribution<Problem, true>());
+
+        auto k_lds_write_view = make_tensor_view<address_space_enum::lds>(
+            static_cast<KDataType* __restrict__>(smem_ptrk0),
+            Policy::template MakeKLdsBlockDescriptor<Problem, true>());
+
+        auto k_lds_read_view = make_tensor_view<address_space_enum::lds>(
+            static_cast<KDataType* __restrict__>(smem_ptrk0),
+            Policy::template MakeKLdsBlockDescriptor<Problem, true, true>());
+
+        auto k_lds_write_window =
+            make_tile_window(k_lds_write_view,
+                             Policy::template MakeKLdsBlockDescriptor<Problem>().get_lengths(),
+                             {0, 0});
+
+        auto k_lds_read_window =
+            make_tile_window(k_lds_read_view,
+                             make_tuple(number<kN0>{}, number<kK0>{}),
+                             {0, 0},
+                             Policy::template MakeKRegTileDistribution<Problem>());
+
+        // S tile in LDS (placed GetSmemSizeK<Problem>() bytes past smem_ptrk0)
+        auto s_lds = make_tensor_view<address_space_enum::lds>(
+            reinterpret_cast<SaccDataType*>(reinterpret_cast<char*>(smem_ptrk0) +
+                                            Policy::template GetSmemSizeK<Problem>()),
+            Policy::template MakeSLdsBlockDescriptor<Problem>());
+        auto s_write_lds_window = make_tile_window(
+            s_lds, Policy::template MakeSLdsBlockDescriptor<Problem>().get_lengths(), {0, 0});
+        auto s_read_lds_window =
+            make_tile_window(s_lds,
+                             Policy::template MakeSLdsBlockDescriptor<Problem>().get_lengths(),
+                             {0, 0},
+                             Policy::template MakeSRegTileDistribution<Problem>());
+
+        // V tile in LDS (windows start on smem_ptrv0; the mainloop re-points them every iteration)
+        auto v_dram_window = make_tile_window(
+            v_dram_block_window_tmp, Policy::template MakeVDramTileDistribution<Problem>());
+
+        auto v_lds_write_view = make_tensor_view<address_space_enum::lds>(
+            reinterpret_cast<VDataType* __restrict__>(static_cast<char*>(smem_ptrv0)),
+            Policy::template MakeVLdsBlockDescriptor<Problem>());
+
+        auto v_lds_read_view = make_tensor_view<address_space_enum::lds>(
+            reinterpret_cast<VDataType* __restrict__>(static_cast<char*>(smem_ptrv0)),
+            Policy::template MakeVLdsBlockDescriptor<Problem, true>());
+
+        auto v_lds_write_window =
+            make_tile_window(v_lds_write_view,
+                             Policy::template MakeVLdsBlockDescriptor<Problem>().get_lengths(),
+                             {0, 0});
+
+        auto v_lds_read_window =
+            make_tile_window(v_lds_read_view,
+                             make_tuple(number<kK1>{}, number<kN1>{}),
+                             {0, 0},
+                             Policy::template MakeVRegTileDistribution<Problem>());
+
+        // block_sync_lds_direct_load<0>();
+        // auto q_tile = load_tile(q_lds_read_window);
+
+        const index_t num_total_loop =
+            integer_divide_ceil(physical_seqlen_k_end - aligned_physical_seqlen_k_start, kN0);
+
+        index_t i_total_loops      = 0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
+        constexpr index_t k1_loops = kN0 / kK1;
+
+        static_assert(1 <= k0_loops);
+        static_assert(1 <= k1_loops);
+        block_sync_lds<0>();
+        async_load_tile(k_lds_write_window, k_dram_window); // K tile 0 -> smem_ptrk0
+        async_load_tile(v_lds_write_window, v_dram_window); // V tile 0 -> smem_ptrv0
+
+        move_tile_window(k_dram_window, {kN0, 0});
+        k_lds_write_window.set_bottom_tensor_view_data_ptr(
+            static_cast<KDataType* __restrict__>(smem_ptrk1));
+        async_load_tile(k_lds_write_window, k_dram_window); // K tile 1 -> smem_ptrk1
+
+        constexpr index_t k_vmem_insts = k_dram_window.get_num_of_access();
+        constexpr index_t v_vmem_insts = v_dram_window.get_num_of_access();
+
+        constexpr index_t k_lds_insts = k_lds_read_window.get_num_of_access();
+        constexpr index_t v_lds_insts = v_lds_read_window.get_num_of_access();
+
+        block_sync_lds_direct_load<k_vmem_insts + v_vmem_insts>(); // presumably waits until at most this many loads are outstanding, i.e. K tile 0 has landed - TODO confirm
+        auto k_tile = load_tile(k_lds_read_window); // kN0 x kK0 slice at the origin of the smem_ptrk0 buffer
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        auto mainloop = [&](index_t cur_loop) { // one iteration: S = Q*K, online softmax update of m/l/o_acc, o_acc += P*V
+            const bool is_even_loop = (cur_loop % 2 == 0); // the read/write roles of the two K and two V buffers swap on loop parity
+
+            auto k_lds_write_ptr = is_even_loop ? static_cast<KDataType* __restrict__>(smem_ptrk0)
+                                                : static_cast<KDataType* __restrict__>(smem_ptrk1);
+            auto k_lds_read_ptr  = is_even_loop ? static_cast<KDataType* __restrict__>(smem_ptrk1)
+                                                : static_cast<KDataType* __restrict__>(smem_ptrk0);
+            auto v_lds_write_ptr = is_even_loop ? static_cast<VDataType* __restrict__>(smem_ptrv1)
+                                                : static_cast<VDataType* __restrict__>(smem_ptrv0);
+            auto v_lds_read_ptr  = is_even_loop ? static_cast<VDataType* __restrict__>(smem_ptrv0)
+                                                : static_cast<VDataType* __restrict__>(smem_ptrv1);
+
+            // move V tile windows and start loading the next V tile into the other V buffer
+            block_sync_lds<k_lds_insts>();
+            move_tile_window(v_dram_window, {kN0, 0});
+            v_lds_write_window.set_bottom_tensor_view_data_ptr(v_lds_write_ptr);
+            async_load_tile(v_lds_write_window, v_dram_window);
+
+            // STAGE 1, QK gemm
+            clear_tile(s_acc); // initialize C
+
+            if constexpr(1 < k0_loops)
+            {
+                static_for<0, k0_loops - 1, 1>{}([&](auto i_k0) {
+                    // loop over along the [K]ey head dimension
+                    move_tile_window(k_lds_read_window, {0, kK0});
+                    auto k_tile_switch = load_tile(k_lds_read_window); // next slice is read from LDS before the gemm on the current slice
+
+                    gemm_0(s_acc,
+                           get_slice_tile(q_tile,
+                                          sequence<0, i_k0 * kK0>{},
+                                          sequence<kM0, (i_k0 + 1) * kK0>{}),
+                           k_tile);
+
+                    k_tile = k_tile_switch;
+                });
+                // move back to the origin
+                move_tile_window(k_lds_read_window, {0, -kK0 * (k0_loops - 1)});
+            }
+
+            gemm_0(s_acc,
+                   get_slice_tile(q_tile,
+                                  sequence<0, (k0_loops - 1) * kK0>{},
+                                  sequence<kM0, k0_loops * kK0>{}),
+                   k_tile);
+
+            block_sync_lds_direct_load<k_vmem_insts + v_vmem_insts>();
+            v_lds_read_window.set_bottom_tensor_view_data_ptr(v_lds_read_ptr);
+            auto v_tile = load_tile_transpose(v_lds_read_window);
+
+            if constexpr(kHasUnevenSplits)
+            {
+                if(i_total_loops == (num_total_loop - 1)) // only the last K tile can run past physical_seqlen_k_end
+                {
+                    const auto k_origin = make_tuple(kN0 * i_total_loops, 0);
+                    set_tile_if(s_acc,
+                                -numeric<SMPLComputeDataType>::infinity(),
+                                [&,
+                                 physical_seqlen_k_start_ = physical_seqlen_k_start,
+                                 physical_seqlen_k_end_   = physical_seqlen_k_end](auto tile_idx) {
+                                    const auto col = k_origin.at(I0) + tile_idx.at(I1);
+
+                                    {
+                                        return physical_seqlen_k_end_ <= col;
+                                    }
+                                });
+                }
+            }
+
+            if constexpr(kPadSeqLenK || FmhaMask::IsMasking)
+            {
+                const auto k_origin = make_tuple(kN0 * i_total_loops, 0);
+
+                bool need_perpixel_check =
+                    mask.IsEdgeTile(q_origin.at(I0), k_origin.at(I0), number<kM0>{}, number<kN0>{});
+                if(need_perpixel_check)
+                {
+                    set_tile_if(
+                        s_acc, -numeric<SMPLComputeDataType>::infinity(), [&](auto tile_idx) {
+                            const auto row = q_origin.at(I0) + tile_idx.at(I0);
+                            const auto col = k_origin.at(I0) + tile_idx.at(I1);
+                            return mask.IsOutOfBound(row, col);
+                        });
+                }
+            }
+
+            // Gemm1
+            auto s_new = [&]() {
+                if constexpr(kNWarp > 1) // S is written to s_lds and re-read through s_read_lds_window
+                {
+                    auto s = cast_tile<SMPLComputeDataType>(s_acc); // S{j}
+
+                    store_tile(s_write_lds_window, s);
+                    block_sync_lds();
+                    return load_tile(s_read_lds_window);
+                }
+                else
+                {
+                    return cast_tile<SMPLComputeDataType>(s_acc); // S{j}
+                }
+            }();
+
+            auto m_local = block_tile_reduce<SMPLComputeDataType>(
+                s_new,
+                sequence<1>{},
+                f_max,
+                -numeric<SMPLComputeDataType>::infinity()); // m_local = rowmax(S{j})
+            block_tile_reduce_sync(
+                m_local, f_max, bool_constant<false>{} /*, bool_constant<false>{}*/);
+
+            static_for<0, 12, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS_READ
+            });
+
+            static_for<0, 4, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS_READ
+            });
+
+            const auto m_old = m; // m{j-1}
+            tile_elementwise_inout(
+                [](auto& e0, auto e1, auto e2) { e0 = max(e1, e2); }, m, m_old, m_local); // m{j}
+
+            auto p_compute = make_static_distributed_tensor<SMPLComputeDataType>(
+                s_new.get_tile_distribution()); // Pcompute{j}
+
+            static const auto get_validated_m = [](SMPLComputeDataType raw_m) {
+                /// NOTICE: bias might be materialized mask including -inf values, need
+                /// consideration
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                             FmhaMask::IsMasking)
+                {
+                    return raw_m == -numeric<SMPLComputeDataType>::infinity()
+                               ? type_convert<SMPLComputeDataType>(0.f)
+                               : raw_m;
+                }
+                else
+                {
+                    return raw_m;
+                }
+            };
+
+            constexpr auto p_spans = decltype(p_compute)::get_distributed_spans();
+            sweep_tile_span(p_spans[I0], [&](auto idx0) {
+                constexpr auto i_idx = make_tuple(idx0);
+                auto row_max         = scale_s * get_validated_m(m[i_idx]);
+                sweep_tile_span(p_spans[I1], [&](auto idx1) {
+                    constexpr auto i_j_idx = make_tuple(idx0, idx1);
+                    if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                                 BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                    {
+                        p_compute(i_j_idx) = exp2(s_new[i_j_idx] - get_validated_m(m[i_idx]));
+                    }
+                    else
+                    {
+                        if constexpr(kHasLogitsSoftCap)
+                        {
+                            p_compute(i_j_idx) = exp2(s_new[i_j_idx] - get_validated_m(m[i_idx]));
+                        }
+                        else
+                        {
+                            p_compute(i_j_idx) = exp2(scale_s * s_new[i_j_idx] - row_max); // scale_s is applied here rather than to S beforehand
+                        }
+                    }
+                });
+            });
+
+            auto rowsum_p = block_tile_reduce<SMPLComputeDataType>(
+                p_compute, sequence<1>{}, f_sum, SMPLComputeDataType{0}); // rowsum(Pcompute{j})
+
+            block_tile_reduce_sync(
+                rowsum_p, f_sum, bool_constant<false>{} /*, bool_constant<false>{}*/);
+
+            auto p_tile = make_static_distributed_tensor<PDataType>(
+                Policy::template MakePRegTileDistribution<Problem>());
+            p_tile.get_thread_buffer() = cast_tile<PDataType>(p_compute).get_thread_buffer();
+
+            // l{j}, Oacc{j}
+            constexpr auto o_spans = decltype(o_acc)::get_distributed_spans();
+            sweep_tile_span(o_spans[I0], [&](auto idx0) {
+                constexpr auto i_idx = make_tuple(idx0);
+                const auto tmp       = [&]() {
+                    if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                                 BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                    {
+                        return exp2(m_old[i_idx] - get_validated_m(m[i_idx]));
+                    }
+                    else
+                    {
+                        if constexpr(kHasLogitsSoftCap)
+                        {
+                            return exp2(m_old[i_idx] - get_validated_m(m[i_idx]));
+                        }
+                        else
+                        {
+                            auto row_max = scale_s * get_validated_m(m[i_idx]);
+                            return exp2(scale_s * m_old[i_idx] - row_max);
+                        }
+                    }
+                }();
+                l(i_idx) = tmp * l[i_idx] + rowsum_p[i_idx]; // tmp rescales the previous sum from m{j-1} to m{j}
+                sweep_tile_span(o_spans[I1], [&](auto idx1) {
+                    constexpr auto i_j_idx = make_tuple(idx0, idx1);
+
+                    o_acc(i_j_idx) *= tmp;
+                });
+            });
+
+            block_sync_lds<v_lds_insts>();
+            move_tile_window(k_dram_window, {kN0, 0});
+            k_lds_write_window.set_bottom_tensor_view_data_ptr(k_lds_write_ptr);
+            async_load_tile(k_lds_write_window, k_dram_window); // starts loading K tile cur_loop + 2 (tiles 0 and 1 were issued before the loop)
+
+            if constexpr(1 < k1_loops)
+            {
+                static_for<0, k1_loops - 1, 1>{}([&](auto i_k1) {
+                    // loop over along the [V]alue Sequence length
+                    move_tile_window(v_lds_read_window, {kK1, 0});
+                    auto v_tile_switch = load_tile_transpose(v_lds_read_window);
+
+                    gemm_1(o_acc,
+                           get_slice_tile(p_tile,
+                                          sequence<0, i_k1 * kK1>{},
+                                          sequence<kM0, (i_k1 + 1) * kK1>{}),
+                           v_tile);
+
+                    v_tile = v_tile_switch;
+                });
+                // move back to the origin
+                move_tile_window(v_lds_read_window, {-kK1 * (k1_loops - 1), 0});
+            }
+
+            gemm_1(o_acc,
+                   get_slice_tile(p_tile,
+                                  sequence<0, (k1_loops - 1) * kK1>{},
+                                  sequence<kM0, k1_loops * kK1>{}),
+                   v_tile);
+
+            block_sync_lds_direct_load<k_vmem_insts + v_vmem_insts>();
+            k_lds_read_window.set_bottom_tensor_view_data_ptr(k_lds_read_ptr);
+            k_tile = load_tile(k_lds_read_window); // first K slice for the next iteration, read from the other K buffer
+
+            static_for<0, 12, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS_READ
+            });
+
+            static_for<0, 4, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS_READ
+            });
+        };
+
+        do
+        {
+            mainloop(i_total_loops);
+            i_total_loops++;
+        } while(i_total_loops < num_total_loop); // do-while: the body always runs at least once
+
+        if constexpr(kStoreLSE)
+        {
+            // store lse acc
+            auto lse_acc = make_static_distributed_tensor<LSEDataType>(m.get_tile_distribution());
+
+            constexpr auto lse_acc_spans = decltype(lse_acc)::get_distributed_spans();
+            sweep_tile_span(lse_acc_spans[I0], [&, m_ = m, l_ = l](auto idx0) {
+                constexpr auto i_idx = make_tuple(idx0);
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                             BiasEnum == BlockAttentionBiasEnum::ALIBI)
+                {
+                    lse_acc(i_idx) = m_[i_idx] / C_LOG2E + log(l_[i_idx]);
+                }
+                else
+                {
+                    if constexpr(kHasLogitsSoftCap)
+                    {
+                        lse_acc(i_idx) = m_[i_idx] / C_LOG2E + log(l_[i_idx]);
+                    }
+                    else
+                    {
+                        lse_acc(i_idx) = m_[i_idx] * scale_s / C_LOG2E + log(l_[i_idx]);
+                    }
+                }
+            });
+
+            if(get_thread_local_1d_id() < kM0)
+            {
+                store_tile(lse_acc_dram_window_tmp, lse_acc);
+            }
+        }
+
+        // finally, O: scale every row of o_acc by 1 / l (by 0 when l == 0 on the bias/masking path)
+        constexpr auto o_spans = decltype(o_acc)::get_distributed_spans();
+
+        sweep_tile_span(o_spans[I0], [&](auto idx0) {
+            constexpr auto i_idx = make_tuple(idx0);
+            const auto tmp       = [&]() {
+                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
+                             FmhaMask::IsMasking)
+                {
+                    return l[i_idx] == 0.f ? 0.f : 1 / l[i_idx];
+                }
+                else
+                    return 1 / l[i_idx];
+            }();
+            sweep_tile_span(o_spans[I1], [&](auto idx1) {
+                constexpr auto i_j_idx = make_tuple(idx0, idx1);
+                o_acc(i_j_idx) *= tmp;
+            });
+        });
+
+        return o_acc;
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload_policy.hpp
new file mode 100644
index 0000000000..6d414ee851
--- /dev/null
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_trload_policy.hpp
@@ -0,0 +1,820 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp"
+#include "ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2_custom_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2.hpp"
+
+// Enabling this can remove all bank conflicts, but it reduces performance in some cases,
+// probably because of limits in compiler optimization.
+#define CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD 0
+namespace ck_tile {
+// In this pipeline Q, K and V are all located in LDS
+struct BlockFmhaPipelineQRKSVSAsyncTrloadDefaultPolicy
+    : BlockFmhaPipelineQXKSVSCustomPolicy</* QLoadOnce = */ true,
+                                          /* AsyncCopy = */ false,
+                                          /* NumPrefetchK = */ 1,
+                                          /* NumPrefetchV = */ 1>
+{
+    using BasePolicy = BlockFmhaPipelineQXKSVSCustomPolicy</* QLoadOnce = */ true,
+                                                           /* AsyncCopy = */ false,
+                                                           /* NumPrefetchK = */ 1,
+                                                           /* NumPrefetchV = */ 1>;
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentQ()
+    {
+        using Shape = typename Problem::BlockFmhaShape;
+
+        // Q block tile is kM0 x kSubQKHeaddim; keep in sync with MakeQDramTileDistribution()
+        constexpr index_t NumThreads = Problem::kBlockSize;
+        constexpr index_t TileRows   = Shape::kM0;
+        constexpr index_t TileCols   = Shape::kSubQKHeaddim;
+        constexpr index_t PerThread  = (TileRows * TileCols) / NumThreads;
+        constexpr index_t ElemsIn16B = 16 / sizeof(typename Problem::QDataType);
+        static_assert(PerThread > 0);
+        return min(PerThread, ElemsIn16B);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentOacc()
+    {
+        // number of OaccDataType elements that fit in 16 bytes
+        constexpr auto BytesPerElem = sizeof(remove_cvref_t<typename Problem::OaccDataType>);
+        return static_cast<index_t>(16 / BytesPerElem);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentK()
+    {
+        using Shape = typename Problem::BlockFmhaShape;
+        // K block tile is kN0 x kSubQKHeaddim, divided among kBlockSize threads
+        constexpr index_t NumThreads = Problem::kBlockSize;
+        constexpr index_t TileRows   = Shape::kN0;
+        constexpr index_t TileCols   = Shape::kSubQKHeaddim;
+        constexpr index_t PerThread  = (TileRows * TileCols) / NumThreads;
+        constexpr index_t ElemsIn16B = 16 / sizeof(typename Problem::KDataType);
+        static_assert(PerThread > 0);
+        return min(PerThread, ElemsIn16B);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentV()
+    {
+        using Shape = typename Problem::BlockFmhaShape;
+        // V block tile is kN1 x kN0, divided among kBlockSize threads
+        constexpr index_t NumThreads = Problem::kBlockSize;
+        constexpr index_t TileRows   = Shape::kN1;
+        constexpr index_t TileCols   = Shape::kN0;
+        constexpr index_t PerThread  = (TileRows * TileCols) / NumThreads;
+        constexpr index_t ElemsIn16B = 16 / sizeof(typename Problem::VDataType);
+        static_assert(PerThread > 0);
+        return min(PerThread, ElemsIn16B);
+    }
+
+    template <typename Problem, bool BypassLDS = false>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeQDramTileDistribution() // returns the static tile distribution for the kM0 x kSubQKHeaddim Q tile
+    {
+        if constexpr(!BypassLDS) // default: layout computed from the block size and a per-thread vector length
+        {
+            constexpr index_t kBlockSize = Problem::kBlockSize;
+            constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
+            constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim;
+
+            constexpr index_t MaxVectorSize = 16 / sizeof(typename Problem::QDataType); // elements per 16 bytes
+
+            constexpr index_t ElemPerThread = (kMPerBlock * kKPerBlock) / kBlockSize;
+            static_assert(0 < ElemPerThread);
+            constexpr index_t kMaxVecLoad = min(ElemPerThread, MaxVectorSize); // same expression as GetAlignmentQ()
+
+            constexpr index_t KPerThread     = kMaxVecLoad;
+            constexpr index_t KThreads       = kKPerBlock / KPerThread;
+            constexpr index_t MThreadPerWarp = get_warp_size() / KThreads;
+            constexpr index_t NumWarps       = kBlockSize / get_warp_size();
+            constexpr index_t MPerThread     = kMPerBlock / (MThreadPerWarp * NumWarps);
+
+            return make_static_tile_distribution( // M is split as MPerThread x NumWarps x MThreadPerWarp, K as KThreads x KPerThread
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<MPerThread, NumWarps, MThreadPerWarp>,
+                                                 sequence<KThreads, KPerThread>>,
+                                           tuple<sequence<1>, sequence<1, 2>>,
+                                           tuple<sequence<1>, sequence<2, 0>>,
+                                           sequence<1, 2>,
+                                           sequence<0, 1>>{});
+        }
+        else // BypassLDS: layout derived from the warp gemm of the QK block gemm
+        {
+            using BlockGemm       = remove_cvref_t<decltype(GetQKBlockGemm<Problem>())>;
+            constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
+            using WarpGemm        = remove_cvref_t<decltype(config.template at<0>())>;
+
+            constexpr index_t MWarp = Problem::BlockFmhaShape::Gemm0BlockWarps::at(number<0>{});
+            constexpr index_t NWarp = Problem::BlockFmhaShape::Gemm0BlockWarps::at(number<1>{});
+
+            constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
+            constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim;
+
+            constexpr index_t MIterPerWarp = kMPerBlock / (MWarp * WarpGemm::kM);
+            constexpr index_t KIterPerWarp = kKPerBlock / WarpGemm::kK;
+
+            constexpr auto q_block_outer_dstr_encoding = tile_distribution_encoding<
+                sequence<NWarp>,
+                tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>,
+                tuple<sequence<1, 0>>,
+                tuple<sequence<1, 0>>,
+                sequence<2, 1>,
+                sequence<0, 0>>{};
+
+            constexpr auto q_block_dstr_encode = detail::make_embed_tile_distribution_encoding( // combines the block-level encoding with the warp gemm's AWarpDstrEncoding
+                q_block_outer_dstr_encoding, typename WarpGemm::AWarpDstrEncoding{});
+
+            constexpr auto q_block_dstr = make_static_tile_distribution(q_block_dstr_encode);
+
+            return q_block_dstr;
+        }
+    }
+
+    // Builds the static tile distribution that spreads a kN0 x kKPerBlock tile of
+    // K over the kBlockSize threads of a block.
+    //
+    // kKPerBlock is kSubQKHeaddim when LoadOnce is true and kK0 otherwise. Each
+    // thread owns K1 contiguous elements along K, where K1 is limited both by a
+    // 16-byte access (16 / sizeof(KDataType)) and by the per-thread share of the
+    // tile (ElemPerThread).
+    template <typename Problem, bool LoadOnce = false>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeKDramTileDistribution()
+    {
+        using KDataType = remove_cvref_t<typename Problem::KDataType>;
+
+        constexpr index_t kBlockSize = Problem::kBlockSize;
+        constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN0;
+        constexpr index_t kKPerBlock =
+            LoadOnce ? Problem::BlockFmhaShape::kSubQKHeaddim : Problem::BlockFmhaShape::kK0;
+
+        constexpr index_t MaxVectorSize = 16 / sizeof(KDataType);
+        constexpr index_t ElemPerThread = (kNPerBlock * kKPerBlock) / kBlockSize;
+        // Same guard as MakeVDramTileDistribution: a tile smaller than the block
+        // would make K1 zero and turn the divisions below into an obscure
+        // constant-expression error instead of a clear diagnostic.
+        static_assert(0 < ElemPerThread);
+
+        constexpr index_t K1 = min(MaxVectorSize, ElemPerThread); // elements per thread along K
+        constexpr index_t K0 = kKPerBlock / K1;                   // threads along K
+        constexpr index_t N2 = get_warp_size() / K0;              // threads of one warp along N
+        constexpr index_t N1 = kBlockSize / get_warp_size();      // warps in the block
+        constexpr index_t N0 = kNPerBlock / (N2 * N1);            // per-thread repeats along N
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<sequence<1>,
+                                       tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
+                                       tuple<sequence<1>, sequence<1, 2>>,
+                                       tuple<sequence<1>, sequence<2, 0>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 1>>{});
+    }
+
+    // Register tile distribution of the Q operand for the Q*K block GEMM: an
+    // outer block-level encoding over (kM0, kSubQKHeaddim) embedded with the warp
+    // GEMM's A-operand encoding.
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeQRegTileDistribution()
+    {
+        using FmhaShape = typename Problem::BlockFmhaShape;
+
+        using QKGemm            = remove_cvref_t<decltype(GetQKBlockGemm<Problem>())>;
+        constexpr auto gemm_cfg = QKGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
+        using WG                = remove_cvref_t<decltype(gemm_cfg.template at<0>())>;
+
+        constexpr index_t kWarpsM = FmhaShape::Gemm0BlockWarps::at(number<0>{});
+        constexpr index_t kWarpsN = FmhaShape::Gemm0BlockWarps::at(number<1>{});
+
+        // Warp-GEMM iterations each warp performs along M and along K.
+        constexpr index_t kItersM = FmhaShape::kM0 / (kWarpsM * WG::kM);
+        constexpr index_t kItersK = FmhaShape::kSubQKHeaddim / WG::kK;
+
+        // Read M first, then K
+        // This is the same data consume order as BlockGEMM
+        constexpr auto outer_encoding =
+            tile_distribution_encoding<sequence<kWarpsN>,
+                                       tuple<sequence<kItersM, kWarpsM>, sequence<kItersK>>,
+                                       tuple<sequence<1, 0>>,
+                                       tuple<sequence<1, 0>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+
+        // Combine the block-level layout with the per-warp A-operand layout.
+        constexpr auto full_encoding = detail::make_embed_tile_distribution_encoding(
+            outer_encoding, typename WG::AWarpDstrEncoding{});
+
+        constexpr auto distribution = make_static_tile_distribution(full_encoding);
+
+        return distribution;
+    }
+
+    // Number of Q elements that fit in 16 bytes; used as the K-pack width of the
+    // Q LDS descriptor.
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPackQ()
+    {
+        // TODO: this is for 3d layout
+        using Element = remove_cvref_t<typename Problem::QDataType>;
+
+        constexpr auto kElementsPer16Bytes = 16 / sizeof(Element);
+        return static_cast<index_t>(kElementsPer16Bytes);
+    }
+
+    // Builds the LDS block descriptor for a Q tile of logical shape
+    // (kM0, kSubQKHeaddim).
+    //
+    // Xor == false: plain row-major descriptor with row stride kKPerBlock.
+    // Xor == true : the tile is viewed as (rows, K / kKPack, kKPack) and an XOR
+    //               transform is applied over the (row, K-pack index) pair before
+    //               the K sub-dimensions are merged back, so the result is still
+    //               addressed as (M, K).
+    //
+    // When CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD is enabled and one row is shorter
+    // than LDSLayerSize (256 / sizeof(QDataType) elements), XorLengthFold rows are
+    // folded into a single LDSLayerSize-wide row before the XOR transform and are
+    // unfolded again afterwards.
+    template <typename Problem, bool Xor = false>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeQLdsBlockDescriptor()
+    {
+        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim;
+
+        constexpr index_t kKPack = GetSmemKPackQ<Problem>();
+
+        constexpr auto q_lds_block_desc = [&]() {
+            if constexpr(Xor)
+            {
+#if CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD
+                constexpr auto LDSLayerSize  = 256 / sizeof(typename Problem::QDataType);
+                constexpr auto XorLengthFold = LDSLayerSize / kKPerBlock;
+
+                if constexpr(XorLengthFold > 1)
+                {
+                    // Folded view: (M / fold, LDSLayerSize / kKPack, kKPack).
+                    constexpr auto q_lds_block_desc_naive = make_naive_tensor_descriptor(
+                        make_tuple(number<kMPerBlock / XorLengthFold>{},
+                                   number<LDSLayerSize / kKPack>{},
+                                   number<kKPack>{}),
+                        make_tuple(number<LDSLayerSize>{}, number<kKPack>{}, number<1>{}),
+                        number<kKPack>{},
+                        number<1>{});
+
+                    constexpr auto q_lds_block_desc_permuted = transform_tensor_descriptor(
+                        q_lds_block_desc_naive,
+                        make_tuple(
+                            make_xor_transform(make_tuple(number<kMPerBlock / XorLengthFold>{},
+                                                          number<LDSLayerSize / kKPack>{})),
+                            make_pass_through_transform(number<kKPack>{})),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                    // Split the folded row back into (fold, K / kKPack):
+                    // (M / fold, fold, K / kKPack, kKPack).
+                    constexpr auto q_lds_block_desc_tmp = transform_tensor_descriptor(
+                        q_lds_block_desc_permuted,
+                        make_tuple(
+                            make_pass_through_transform(number<kMPerBlock / XorLengthFold>{}),
+                            make_unmerge_transform(
+                                make_tuple(number<XorLengthFold>{}, number<kKPerBlock / kKPack>{})),
+                            make_pass_through_transform(number<kKPack>{})),
+                        make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3>{}));
+
+                    // Merge back to (M, K). The K merge must use the lengths of
+                    // dims 2 and 3 produced above, i.e. (kKPerBlock / kKPack, kKPack);
+                    // using kMPerBlock here would report a wrong K length whenever
+                    // kMPerBlock != kKPerBlock.
+                    return transform_tensor_descriptor(
+                        q_lds_block_desc_tmp,
+                        make_tuple(
+                            make_merge_transform_v3_division_mod(make_tuple(
+                                number<kMPerBlock / XorLengthFold>{}, number<XorLengthFold>{})),
+                            make_merge_transform_v3_division_mod(
+                                make_tuple(number<kKPerBlock / kKPack>{}, number<kKPack>{}))),
+                        make_tuple(sequence<0, 1>{}, sequence<2, 3>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                }
+                else
+#endif // CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD
+                {
+                    // Unfolded view: (M, K / kKPack, kKPack).
+                    constexpr auto q_lds_block_desc_naive = make_naive_tensor_descriptor(
+                        make_tuple(
+                            number<kMPerBlock>{}, number<kKPerBlock / kKPack>{}, number<kKPack>{}),
+                        make_tuple(number<kKPerBlock>{}, number<kKPack>{}, number<1>{}),
+                        number<kKPack>{},
+                        number<1>{});
+
+                    constexpr auto q_lds_block_desc_permuted = transform_tensor_descriptor(
+                        q_lds_block_desc_naive,
+                        make_tuple(make_xor_transform(make_tuple(number<kMPerBlock>{},
+                                                                 number<kKPerBlock / kKPack>{})),
+                                   make_pass_through_transform(number<kKPack>{})),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                    // Merge (K / kKPack, kKPack) back into a single K dimension.
+                    return transform_tensor_descriptor(
+                        q_lds_block_desc_permuted,
+                        make_tuple(make_pass_through_transform(number<kMPerBlock>{}),
+                                   make_merge_transform_v3_division_mod(make_tuple(
+                                       number<kKPerBlock / kKPack>{}, number<kKPack>{}))),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                }
+            }
+            else
+            {
+                return make_naive_tensor_descriptor(
+                    make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
+                    make_tuple(number<kKPerBlock>{}, number<1>{}),
+                    number<kKPack>{},
+                    number<1>{});
+            }
+        }();
+
+        return q_lds_block_desc;
+    }
+
+    // Builds the LDS block descriptor for a K tile of logical shape
+    // (kN0, kKPerBlock), where kKPerBlock is kSubQKHeaddim when LoadOnce is true
+    // and kK0 otherwise.
+    //
+    // Xor == false: plain row-major descriptor with row stride kKPerBlock.
+    // Xor == true : the tile is viewed as (rows, K / kKPack, kKPack) and an XOR
+    //               transform is applied over the (row, K-pack index) pair before
+    //               the K sub-dimensions are merged back, so the result is still
+    //               addressed as (N, K).
+    //
+    // When CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD is enabled and one row is shorter
+    // than LDSLayerSize (256 / sizeof(KDataType) elements), XorLengthFold rows are
+    // folded into a single LDSLayerSize-wide row before the XOR transform and are
+    // unfolded again afterwards.
+    template <typename Problem, bool LoadOnce = false, bool Xor = false>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeKLdsBlockDescriptor()
+    {
+        constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN0;
+        constexpr index_t kKPerBlock =
+            LoadOnce ? Problem::BlockFmhaShape::kSubQKHeaddim : Problem::BlockFmhaShape::kK0;
+
+        constexpr index_t kKPack = GetSmemKPackK<Problem>();
+
+        constexpr auto k_lds_block_desc = [&]() {
+            if constexpr(Xor)
+            {
+#if CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD
+                constexpr auto LDSLayerSize  = 256 / sizeof(typename Problem::KDataType);
+                constexpr auto XorLengthFold = LDSLayerSize / kKPerBlock;
+
+                if constexpr(XorLengthFold > 1)
+                {
+                    // Folded view: (N / fold, LDSLayerSize / kKPack, kKPack).
+                    constexpr auto k_lds_block_desc_naive = make_naive_tensor_descriptor(
+                        make_tuple(number<kNPerBlock / XorLengthFold>{},
+                                   number<LDSLayerSize / kKPack>{},
+                                   number<kKPack>{}),
+                        make_tuple(number<LDSLayerSize>{}, number<kKPack>{}, number<1>{}),
+                        number<kKPack>{},
+                        number<1>{});
+
+                    constexpr auto k_lds_block_desc_permuted = transform_tensor_descriptor(
+                        k_lds_block_desc_naive,
+                        make_tuple(
+                            make_xor_transform(make_tuple(number<kNPerBlock / XorLengthFold>{},
+                                                          number<LDSLayerSize / kKPack>{})),
+                            make_pass_through_transform(number<kKPack>{})),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                    // Split the folded row back into (fold, K / kKPack):
+                    // (N / fold, fold, K / kKPack, kKPack).
+                    constexpr auto k_lds_block_desc_tmp = transform_tensor_descriptor(
+                        k_lds_block_desc_permuted,
+                        make_tuple(
+                            make_pass_through_transform(number<kNPerBlock / XorLengthFold>{}),
+                            make_unmerge_transform(
+                                make_tuple(number<XorLengthFold>{}, number<kKPerBlock / kKPack>{})),
+                            make_pass_through_transform(number<kKPack>{})),
+                        make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3>{}));
+
+                    // Merge back to (N, K). The K merge must use the lengths of
+                    // dims 2 and 3 produced above, i.e. (kKPerBlock / kKPack, kKPack);
+                    // using kNPerBlock here would report a wrong K length whenever
+                    // kNPerBlock != kKPerBlock.
+                    return transform_tensor_descriptor(
+                        k_lds_block_desc_tmp,
+                        make_tuple(
+                            make_merge_transform_v3_division_mod(make_tuple(
+                                number<kNPerBlock / XorLengthFold>{}, number<XorLengthFold>{})),
+                            make_merge_transform_v3_division_mod(
+                                make_tuple(number<kKPerBlock / kKPack>{}, number<kKPack>{}))),
+                        make_tuple(sequence<0, 1>{}, sequence<2, 3>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                }
+                else
+#endif // CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD
+                {
+                    // Unfolded view: (N, K / kKPack, kKPack).
+                    constexpr auto k_lds_block_desc_naive = make_naive_tensor_descriptor(
+                        make_tuple(
+                            number<kNPerBlock>{}, number<kKPerBlock / kKPack>{}, number<kKPack>{}),
+                        make_tuple(number<kKPerBlock>{}, number<kKPack>{}, number<1>{}),
+                        number<kKPack>{},
+                        number<1>{});
+
+                    constexpr auto k_lds_block_desc_permuted = transform_tensor_descriptor(
+                        k_lds_block_desc_naive,
+                        make_tuple(make_xor_transform(make_tuple(number<kNPerBlock>{},
+                                                                 number<kKPerBlock / kKPack>{})),
+                                   make_pass_through_transform(number<kKPack>{})),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                    // Merge (K / kKPack, kKPack) back into a single K dimension.
+                    return transform_tensor_descriptor(
+                        k_lds_block_desc_permuted,
+                        make_tuple(make_pass_through_transform(number<kNPerBlock>{}),
+                                   make_merge_transform_v3_division_mod(make_tuple(
+                                       number<kKPerBlock / kKPack>{}, number<kKPack>{}))),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                }
+            }
+            else
+            {
+                return make_naive_tensor_descriptor(
+                    make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}),
+                    make_tuple(number<kKPerBlock>{}, number<1>{}),
+                    number<kKPack>{},
+                    number<1>{});
+            }
+        }();
+
+        return k_lds_block_desc;
+    }
+
+    // Builds the LDS block descriptor for a V tile of logical shape (kN0, kN1):
+    // kKPerBlock (= kN0) is the outer dimension and kNPerBlock (= kN1) is the
+    // contiguous one.
+    //
+    // Xor == false: plain row-major descriptor with row stride kNPerBlock.
+    // Xor == true : each row is split into groups of XorGroupSize elements
+    //               (Gemm1WarpTile[0]) and an XOR transform is applied over the
+    //               (row, group index) pair before the groups are merged back.
+    //
+    // When CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD is enabled and one row is shorter
+    // than LDSLayerSize (256 / sizeof(VDataType) elements), XorLengthFold rows are
+    // folded into a single LDSLayerSize-wide row before the XOR transform and are
+    // unfolded again afterwards.
+    template <typename Problem, bool Xor = false>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeVLdsBlockDescriptor()
+    {
+        constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN1;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kN0;
+
+        constexpr index_t kKPack = GetSmemKPackV<Problem>();
+
+        constexpr auto v_lds_block_desc = [&]() {
+            if constexpr(Xor)
+            {
+                constexpr auto XorGroupSize =
+                    Problem::BlockFmhaShape::Gemm1WarpTile::at(number<0>{});
+
+#if CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD
+                constexpr auto LDSLayerSize  = 256 / sizeof(typename Problem::VDataType);
+                constexpr auto XorLengthFold = LDSLayerSize / kNPerBlock;
+
+                if constexpr(XorLengthFold > 1)
+                {
+                    // Folded view: (K / fold, LDSLayerSize / XorGroupSize, XorGroupSize).
+                    constexpr auto v_lds_block_desc_naive = make_naive_tensor_descriptor(
+                        make_tuple(number<kKPerBlock / XorLengthFold>{},
+                                   number<LDSLayerSize / XorGroupSize>{},
+                                   number<XorGroupSize>{}),
+                        make_tuple(number<LDSLayerSize>{}, number<XorGroupSize>{}, number<1>{}),
+                        number<kKPack>{},
+                        number<1>{});
+
+                    constexpr auto v_lds_block_desc_permuted = transform_tensor_descriptor(
+                        v_lds_block_desc_naive,
+                        make_tuple(
+                            make_xor_transform(make_tuple(number<kKPerBlock / XorLengthFold>{},
+                                                          number<LDSLayerSize / XorGroupSize>{})),
+                            make_pass_through_transform(number<XorGroupSize>{})),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                    // Split the folded row back into (fold, N / XorGroupSize):
+                    // (K / fold, fold, N / XorGroupSize, XorGroupSize).
+                    constexpr auto v_lds_block_desc_tmp = transform_tensor_descriptor(
+                        v_lds_block_desc_permuted,
+                        make_tuple(
+                            make_pass_through_transform(number<kKPerBlock / XorLengthFold>{}),
+                            make_unmerge_transform(make_tuple(number<XorLengthFold>{},
+                                                              number<kNPerBlock / XorGroupSize>{})),
+                            make_pass_through_transform(number<XorGroupSize>{})),
+                        make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3>{}));
+
+                    // Merge back to the two-dimensional (K, N) view.
+                    return transform_tensor_descriptor(
+                        v_lds_block_desc_tmp,
+                        make_tuple(
+                            make_merge_transform_v3_division_mod(make_tuple(
+                                number<kKPerBlock / XorLengthFold>{}, number<XorLengthFold>{})),
+                            make_merge_transform_v3_division_mod(make_tuple(
+                                number<kNPerBlock / XorGroupSize>{}, number<XorGroupSize>{}))),
+                        make_tuple(sequence<0, 1>{}, sequence<2, 3>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                }
+                else
+#endif // CK_TILE_FMHA_HANDLE_XOR_LENGTH_FOLD
+                {
+                    // Unfolded view: (K, N / XorGroupSize, XorGroupSize).
+                    constexpr auto v_lds_block_desc_naive = make_naive_tensor_descriptor(
+                        make_tuple(number<kKPerBlock>{},
+                                   number<kNPerBlock / XorGroupSize>{},
+                                   number<XorGroupSize>{}),
+                        make_tuple(number<kNPerBlock>{}, number<XorGroupSize>{}, number<1>{}),
+                        number<kKPack>{},
+                        number<1>{});
+
+                    constexpr auto v_lds_block_desc_permuted = transform_tensor_descriptor(
+                        v_lds_block_desc_naive,
+                        make_tuple(make_xor_transform(make_tuple(
+                                       number<kKPerBlock>{}, number<kNPerBlock / XorGroupSize>{})),
+                                   make_pass_through_transform(number<XorGroupSize>{})),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}),
+                        make_tuple(sequence<0, 1>{}, sequence<2>{}));
+
+                    // Merge (N / XorGroupSize, XorGroupSize) back into a single N dimension.
+                    return transform_tensor_descriptor(
+                        v_lds_block_desc_permuted,
+                        make_tuple(
+                            make_pass_through_transform(number<kKPerBlock>{}),
+                            make_merge_transform_v3_division_mod(make_tuple(
+                                number<kNPerBlock / XorGroupSize>{}, number<XorGroupSize>{}))),
+                        make_tuple(sequence<0>{}, sequence<1, 2>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                }
+            }
+            else
+            {
+                return make_naive_tensor_descriptor(
+                    make_tuple(number<kKPerBlock>{}, number<kNPerBlock>{}),
+                    make_tuple(number<kNPerBlock>{}, number<1>{}),
+                    number<kKPack>{},
+                    number<1>{});
+            }
+        }();
+
+        return v_lds_block_desc;
+    }
+
+    // Returns the block GEMM object used for the first GEMM (Q * K -> S).
+    //
+    // The GEMM works on a (kM0, kN0, kK0) block tile with the Gemm0 warp layout
+    // and warp tile, accumulates into SaccDataType, and is instantiated as
+    // BlockGemmARegBRegCRegV2 with loop order MNK.
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetQKBlockGemm()
+    {
+        using GemmProblem =
+            BlockGemmProblem<typename Problem::QDataType,
+                             typename Problem::KDataType,
+                             typename Problem::SaccDataType,
+                             Problem::kBlockSize,
+                             TileGemmShape<sequence<Problem::BlockFmhaShape::kM0,
+                                                    Problem::BlockFmhaShape::kN0,
+                                                    Problem::BlockFmhaShape::kK0>,
+                                           typename Problem::BlockFmhaShape::Gemm0BlockWarps,
+                                           typename Problem::BlockFmhaShape::Gemm0WarpTile>>;
+
+        // Warp-level GEMM selected from the Gemm0 warp tile (M, N, K).
+        // NOTE(review): the trailing `true` is a positional WarpGemmDispatcher
+        // flag whose meaning is not visible here - confirm against the dispatcher.
+        using WarpGemm = WarpGemmDispatcher<typename Problem::QDataType,
+                                            typename Problem::KDataType,
+                                            typename Problem::SaccDataType,
+                                            Problem::BlockFmhaShape::Gemm0WarpTile::at(number<0>{}),
+                                            Problem::BlockFmhaShape::Gemm0WarpTile::at(number<1>{}),
+                                            Problem::BlockFmhaShape::Gemm0WarpTile::at(number<2>{}),
+                                            true>;
+
+        using BlockGemmPolicy =
+            BlockGemmARegBRegCRegV2CustomPolicy<typename Problem::QDataType,
+                                                typename Problem::KDataType,
+                                                typename Problem::SaccDataType,
+                                                typename Problem::BlockFmhaShape::Gemm0BlockWarps,
+                                                WarpGemm,
+                                                GemmLoopOrder::MNK>;
+
+        return BlockGemmARegBRegCRegV2<GemmProblem, BlockGemmPolicy>{};
+    }
+
+    // Returns the block GEMM object used for the second GEMM (P * V -> O).
+    //
+    // The GEMM works on a (kM0, kN1, kK1) block tile with the Gemm1 warp layout
+    // and warp tile, accumulates into OaccDataType, and is instantiated as
+    // BlockGemmARegBRegCRegV2 with loop order KMN.
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetPVBlockGemm()
+    {
+        using GemmProblem =
+            BlockGemmProblem<typename Problem::PDataType,
+                             typename Problem::VDataType,
+                             typename Problem::OaccDataType,
+                             Problem::kBlockSize,
+                             TileGemmShape<sequence<Problem::BlockFmhaShape::kM0,
+                                                    Problem::BlockFmhaShape::kN1,
+                                                    Problem::BlockFmhaShape::kK1>,
+                                           typename Problem::BlockFmhaShape::Gemm1BlockWarps,
+                                           typename Problem::BlockFmhaShape::Gemm1WarpTile>>;
+
+        // Warp-level GEMM selected from the Gemm1 warp tile (M, N, K).
+        // The last argument picks WGAttrNumAccessEnum::Double when the warp tile's
+        // (N, K) is (16, 32) or (32, 16), and Single otherwise.
+        // NOTE(review): the positional `true, false, false` flags are
+        // WarpGemmDispatcher parameters whose meaning is not visible here -
+        // confirm against the dispatcher.
+        using WarpGemm =
+            WarpGemmDispatcher<typename Problem::PDataType,
+                               typename Problem::VDataType,
+                               typename Problem::OaccDataType,
+                               Problem::BlockFmhaShape::Gemm1WarpTile::at(number<0>{}),
+                               Problem::BlockFmhaShape::Gemm1WarpTile::at(number<1>{}),
+                               Problem::BlockFmhaShape::Gemm1WarpTile::at(number<2>{}),
+                               true,
+                               false,
+                               false,
+                               ((Problem::BlockFmhaShape::Gemm1WarpTile::at(number<1>{}) == 16 &&
+                                 Problem::BlockFmhaShape::Gemm1WarpTile::at(number<2>{}) == 32) ||
+                                (Problem::BlockFmhaShape::Gemm1WarpTile::at(number<1>{}) == 32 &&
+                                 Problem::BlockFmhaShape::Gemm1WarpTile::at(number<2>{}) == 16))
+                                   ? WGAttrNumAccessEnum::Double
+                                   : WGAttrNumAccessEnum::Single>;
+
+        using BlockGemmPolicy =
+            BlockGemmARegBRegCRegV2CustomPolicy<typename Problem::PDataType,
+                                                typename Problem::VDataType,
+                                                typename Problem::OaccDataType,
+                                                typename Problem::BlockFmhaShape::Gemm1BlockWarps,
+                                                WarpGemm,
+                                                GemmLoopOrder::KMN>;
+
+        return BlockGemmARegBRegCRegV2<GemmProblem, BlockGemmPolicy>{};
+    }
+
+    // Register tile distribution of the K operand for the Q*K block GEMM: an
+    // outer block-level encoding over (kN0, kK0) embedded with the warp GEMM's
+    // B-operand encoding.
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeKRegTileDistribution()
+    {
+        using FmhaShape = typename Problem::BlockFmhaShape;
+
+        using QKGemm            = remove_cvref_t<decltype(GetQKBlockGemm<Problem>())>;
+        constexpr auto gemm_cfg = QKGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
+        using WG                = remove_cvref_t<decltype(gemm_cfg.template at<0>())>;
+
+        constexpr index_t kWarpsM = FmhaShape::Gemm0BlockWarps::at(number<0>{});
+        constexpr index_t kWarpsN = FmhaShape::Gemm0BlockWarps::at(number<1>{});
+
+        // Warp-GEMM iterations each warp performs along N and along K.
+        constexpr index_t kItersN = FmhaShape::kN0 / (kWarpsN * WG::kN);
+        constexpr index_t kItersK = FmhaShape::kK0 / WG::kK;
+
+        // Read N first, then K
+        // This is the same data consume order as BlockGEMM
+        constexpr auto outer_encoding =
+            tile_distribution_encoding<sequence<kWarpsM>,
+                                       tuple<sequence<kItersN, kWarpsN>, sequence<kItersK>>,
+                                       tuple<sequence<0, 1>>,
+                                       tuple<sequence<0, 1>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+
+        // Combine the block-level layout with the per-warp B-operand layout.
+        constexpr auto full_encoding = detail::make_embed_tile_distribution_encoding(
+            outer_encoding, typename WG::BWarpDstrEncoding{});
+
+        constexpr auto distribution = make_static_tile_distribution(full_encoding);
+
+        return distribution;
+    }
+
+    // Builds the static tile distribution that spreads a kN0 x kN1 tile of V
+    // (kKPerBlock x kNPerBlock) over the kBlockSize threads of a block.
+    //
+    // Each thread owns NPerThread contiguous elements along N, limited both by a
+    // 16-byte access (16 / sizeof(VDataType)) and by the per-thread share of the
+    // tile (ElemPerThread).
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeVDramTileDistribution()
+    {
+        constexpr index_t kBlockSize = Problem::kBlockSize;
+        constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN1;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kN0;
+
+        constexpr index_t MaxVectorSize = 16 / sizeof(typename Problem::VDataType);
+
+        constexpr index_t ElemPerThread = (kNPerBlock * kKPerBlock) / kBlockSize;
+        // The tile must hold at least one element per thread; otherwise the
+        // divisions below would divide by zero.
+        static_assert(0 < ElemPerThread);
+        constexpr index_t kMaxVecLoad = min(ElemPerThread, MaxVectorSize);
+
+        constexpr index_t NPerThread     = kMaxVecLoad;                  // elements per thread along N
+        constexpr index_t NThreads       = kNPerBlock / NPerThread;      // threads along N
+        constexpr index_t KThreadPerWarp = get_warp_size() / NThreads;   // threads of one warp along K
+        constexpr index_t NumWarps       = kBlockSize / get_warp_size(); // warps in the block
+        constexpr index_t KPerThread     = kKPerBlock / (KThreadPerWarp * NumWarps); // repeats along K
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<sequence<1>,
+                                       tuple<sequence<KPerThread, NumWarps, KThreadPerWarp>,
+                                             sequence<NThreads, NPerThread>>,
+                                       tuple<sequence<1>, sequence<1, 2>>,
+                                       tuple<sequence<1>, sequence<2, 0>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 1>>{});
+    }
+
+    // Register tile distribution of the P operand (A operand of the P*V block
+    // GEMM): an outer block-level encoding over (kM0, kN0) embedded with the warp
+    // GEMM's A-operand encoding.
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakePRegTileDistribution()
+    {
+        using BlockGemm       = remove_cvref_t<decltype(GetPVBlockGemm<Problem>())>;
+        constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
+        using WarpGemm        = remove_cvref_t<decltype(config.template at<0>())>;
+
+        constexpr index_t MWarp = Problem::BlockFmhaShape::Gemm1BlockWarps::at(number<0>{});
+        constexpr index_t NWarp = Problem::BlockFmhaShape::Gemm1BlockWarps::at(number<1>{});
+
+        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kN0;
+
+        // Warp-GEMM iterations each warp performs along M and along K.
+        constexpr index_t MIterPerWarp = kMPerBlock / (MWarp * WarpGemm::kM);
+        constexpr index_t KIterPerWarp = kKPerBlock / WarpGemm::kK;
+
+        // Read M first, then K
+        // This is the same data consume order as BlockGEMM
+        // NOTE(review): the Y-dimension order here is sequence<2, 1>, whereas
+        // MakeQRegTileDistribution carries the same comment with sequence<1, 2> -
+        // confirm which order the comment is meant to describe.
+        constexpr auto p_block_outer_dstr_encoding =
+            tile_distribution_encoding<sequence<NWarp>,
+                                       tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>,
+                                       tuple<sequence<1, 0>>,
+                                       tuple<sequence<1, 0>>,
+                                       sequence<2, 1>,
+                                       sequence<0, 0>>{};
+
+        // Combine the block-level layout with the per-warp A-operand layout.
+        constexpr auto p_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            p_block_outer_dstr_encoding, typename WarpGemm::AWarpDstrEncoding{});
+
+        constexpr auto p_block_dstr = make_static_tile_distribution(p_block_dstr_encode);
+
+        return p_block_dstr;
+    }
+
+    // Register tile distribution of the V operand (B operand of the P*V block
+    // GEMM): an outer block-level encoding over (kN1, kK1) is embedded with the
+    // warp GEMM's B-operand encoding, and the distribution is then built from the
+    // TransposedDstrEncode that InputTileDistributionTraits derives from it.
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeVRegTileDistribution()
+    {
+        using BlockGemm       = remove_cvref_t<decltype(GetPVBlockGemm<Problem>())>;
+        constexpr auto config = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
+        using WarpGemm        = remove_cvref_t<decltype(config.template at<0>())>;
+
+        constexpr index_t MWarp = Problem::BlockFmhaShape::Gemm1BlockWarps::at(number<0>{});
+        constexpr index_t NWarp = Problem::BlockFmhaShape::Gemm1BlockWarps::at(number<1>{});
+
+        constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN1;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK1;
+
+        // Warp-GEMM iterations each warp performs along N and along K.
+        constexpr index_t NIterPerWarp = kNPerBlock / (NWarp * WarpGemm::kN);
+        constexpr index_t KIterPerWarp = kKPerBlock / WarpGemm::kK;
+
+        // Read N first, then K
+        // This is the same data consume order as BlockGEMM
+        constexpr auto v_block_outer_dstr_encoding =
+            tile_distribution_encoding<sequence<MWarp>,
+                                       tuple<sequence<NIterPerWarp, NWarp>, sequence<KIterPerWarp>>,
+                                       tuple<sequence<0, 1>>,
+                                       tuple<sequence<0, 1>>,
+                                       sequence<2, 1>,
+                                       sequence<0, 0>>{};
+
+        // Combine the block-level layout with the per-warp B-operand layout.
+        constexpr auto v_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            v_block_outer_dstr_encoding, typename WarpGemm::BWarpDstrEncoding{});
+
+        // Only the type of v_block_dstr_encode is used: the traits class maps it
+        // to the transposed encoding for VDataType.
+        constexpr auto v_block_dstr =
+            make_static_tile_distribution(typename InputTileDistributionTraits<
+                                          decltype(v_block_dstr_encode),
+                                          typename Problem::VDataType>::TransposedDstrEncode{});
+
+        return v_block_dstr;
+    }
+
+    // Number of S accumulator elements (SaccDataType) that fit in 16 bytes; used
+    // as the N-pack width of the S LDS descriptor.
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemNPackS()
+    {
+        using Element = remove_cvref_t<typename Problem::SaccDataType>;
+
+        constexpr auto kElementsPer16Bytes = 16 / sizeof(Element);
+        return static_cast<index_t>(kElementsPer16Bytes);
+    }
+
+    // Builds the LDS block descriptor for the S tile, addressed as (kM0, kN0).
+    //
+    // Storage is laid out as (N / kNPack, M, kNPack) with the outermost stride set
+    // to (kMPerBlock + 1) * kNPack, i.e. one extra kNPack-wide slot per N-pack
+    // slab beyond the kMPerBlock rows it holds.
+    // NOTE(review): the extra slot is presumably padding against LDS bank
+    // conflicts - confirm.
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeSLdsBlockDescriptor()
+    {
+        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
+        constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN0;
+        constexpr index_t kNPack     = GetSmemNPackS<Problem>();
+
+        constexpr auto s_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<kNPerBlock / kNPack>{}, number<kMPerBlock>{}, number<kNPack>{}),
+            make_tuple(number<(kMPerBlock + 1) * kNPack>{}, number<kNPack>{}, number<1>{}),
+            number<kNPack>{},
+            number<1>{});
+
+        // Re-expose the storage as (M, N): dim 1 passes through as M, and dims
+        // (0, 2) are merged into a single N dimension.
+        constexpr auto s_lds_block_desc = transform_tensor_descriptor(
+            s_lds_block_desc_0,
+            make_tuple(
+                make_pass_through_transform(number<kMPerBlock>{}),
+                make_merge_transform(make_tuple(number<kNPerBlock / kNPack>{}, number<kNPack>{}))),
+            make_tuple(sequence<1>{}, sequence<0, 2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return s_lds_block_desc;
+    }
+
+    // Builds the register tile distribution for the S tile of shape (kM0, kN0),
+    // split along K into K0 chunks of kK1 elements, using the warp GEMM
+    // configuration reported by the block GEMM policy.
+    //
+    // NOTE(review): this uses GetKVBlockGemm, while the other distributions in
+    // this part of the policy use GetQKBlockGemm / GetPVBlockGemm; its definition
+    // is not visible here - confirm it exists and is the intended GEMM.
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeSRegTileDistribution()
+    {
+        using BlockGemm = remove_cvref_t<decltype(GetKVBlockGemm<Problem>())>;
+
+        constexpr auto config   = BlockGemm::Policy::template GetWarpGemmMWarpNWarp<Problem>();
+        using WG                = remove_cvref_t<decltype(config.template at<0>())>;
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+
+        // static_assert(MWarp == 1, "Check failed!");
+
+        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK1;
+        constexpr index_t kTileK     = Problem::BlockFmhaShape::kN0;
+
+        // K2 is equal to Impl::kABKPerLane * kKIterPerWarpGemm
+        // NOTE(review): the line above does not match the code below, which sets
+        // K2 = Impl::kABKLane and K3 = WG::kK / Impl::kABKLane - confirm which is
+        // intended.
+        constexpr index_t K3 = WG::kK / WG::WarpGemmAttribute::Impl::kABKLane;
+        constexpr index_t K2 = WG::WarpGemmAttribute::Impl::kABKLane;
+        constexpr index_t K1 = kKPerBlock / (K2 * K3); // warp-GEMM K steps per kK1 chunk
+        constexpr index_t K0 = kTileK / kKPerBlock;    // kK1 chunks per kN0 tile
+        constexpr index_t M2 = WG::WarpGemmAttribute::Impl::kAMLane;
+        constexpr index_t M1 = MWarp;
+        constexpr index_t M0 = kMPerBlock / (M2 * M1); // per-warp repeats along M
+
+        constexpr auto s2_block_dstr_encoding =
+            tile_distribution_encoding<sequence<NWarp>,
+                                       tuple<sequence<M0, M1, M2>, sequence<K0, K1, K2, K3>>,
+                                       tuple<sequence<1, 0>, sequence<2, 1>>,
+                                       tuple<sequence<1, 0>, sequence<2, 2>>,
+                                       sequence<1, 2, 2, 2>,
+                                       sequence<0, 0, 1, 3>>{};
+
+        constexpr auto s2_block_dstr = make_static_tile_distribution(s2_block_dstr_encoding);
+
+        return s2_block_dstr;
+    }
+
+    // Size in bytes of the Q tile in LDS: element space of the default
+    // (non-XOR) Q LDS descriptor times sizeof(QDataType).
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeQ()
+    {
+        using Element = typename Problem::QDataType;
+
+        constexpr auto q_lds_desc = MakeQLdsBlockDescriptor<Problem>();
+        return q_lds_desc.get_element_space_size() * sizeof(Element);
+    }
+
+    // Size in bytes of the K tile in LDS: element space of the non-XOR K LDS
+    // descriptor (honouring LoadOnce) times sizeof(KDataType).
+    template <typename Problem, bool LoadOnce = false>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeK()
+    {
+        using Element = typename Problem::KDataType;
+
+        constexpr auto k_lds_desc = MakeKLdsBlockDescriptor<Problem, LoadOnce>();
+        return k_lds_desc.get_element_space_size() * sizeof(Element);
+    }
+
+    // Size in bytes of the V tile in LDS: element space of the default
+    // (non-XOR) V LDS descriptor times sizeof(VDataType).
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeV()
+    {
+        using Element = typename Problem::VDataType;
+
+        constexpr auto v_lds_desc = MakeVLdsBlockDescriptor<Problem>();
+        return v_lds_desc.get_element_space_size() * sizeof(Element);
+    }
+
+    // Size in bytes of the S tile in LDS. LDS space for S is only reserved when
+    // the first GEMM uses more than one warp along N; otherwise the size is 0.
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSizeS()
+    {
+        constexpr index_t kWarpsAlongN =
+            Problem::BlockFmhaShape::Gemm0BlockWarps::at(number<1>{});
+
+        if(kWarpsAlongN > 1)
+        {
+            return MakeSLdsBlockDescriptor<Problem>().get_element_space_size() *
+                   sizeof(typename Problem::SaccDataType);
+        }
+
+        return 0;
+    }
+
+    // Total LDS bytes required by the pipeline: the larger of the Q footprint
+    // and the combined K + S + V footprint.
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
+    {
+        // Alignment on gfx950 is 1280 Bytes
+        // Alignment before gfx950 is 512 Bytes.
+        constexpr index_t q_bytes = GetSmemSizeQ<Problem>();
+        constexpr index_t ksv_bytes =
+            GetSmemSizeK<Problem>() + GetSmemSizeS<Problem>() + GetSmemSizeV<Problem>();
+
+        return max(q_bytes, ksv_bytes);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
index 3489d6f9a1..ff1f31edc8 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
@@ -364,7 +364,13 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         using KDataType = remove_cvref_t<typename Problem::KDataType>;
         if constexpr(AsyncCopy)
         {
-            return 4 / sizeof(KDataType);
+#if defined(__gfx950__)
+            constexpr index_t MaxLoadSizeInBytes = 4 * 4; // dwordx4
+#else
+            constexpr index_t MaxLoadSizeInBytes = 4; // dword
+#endif
+
+            return MaxLoadSizeInBytes / sizeof(KDataType);
         }
         else
         {
@@ -383,23 +389,31 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
     CK_TILE_HOST_DEVICE static constexpr auto GetSmemKPackV()
     {
         // TODO: this is for 3d layout
-        using VDataType = remove_cvref_t<typename Problem::VDataType>;
-        return 16 / sizeof(VDataType);
+        using VDataType                = remove_cvref_t<typename Problem::VDataType>;
+        constexpr index_t kBlockSize   = Problem::kBlockSize;
+        constexpr index_t kNPerBlock   = Problem::BlockFmhaShape::kN1;
+        constexpr index_t kKPerBlock   = Problem::BlockFmhaShape::kK1;
+        constexpr index_t total_pixels = kNPerBlock * kKPerBlock / kBlockSize;
+        constexpr index_t kMaxVecLoad =
+            min(total_pixels, static_cast<index_t>(16 / sizeof(VDataType)));
+
+        return kMaxVecLoad;
     }
 
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentV()
     {
-        using VLayout   = remove_cvref_t<typename Problem::BlockFmhaShape::VLayout>;
-        using VDataType = remove_cvref_t<typename Problem::VDataType>;
+        using VLayout                  = remove_cvref_t<typename Problem::BlockFmhaShape::VLayout>;
+        using VDataType                = remove_cvref_t<typename Problem::VDataType>;
+        constexpr index_t kBlockSize   = Problem::kBlockSize;
+        constexpr index_t kNPerBlock   = Problem::BlockFmhaShape::kN1;
+        constexpr index_t kKPerBlock   = Problem::BlockFmhaShape::kK1;
+        constexpr index_t total_pixels = kNPerBlock * kKPerBlock / kBlockSize;
+        constexpr index_t kMaxVecLoad =
+            min(total_pixels, static_cast<index_t>(16 / sizeof(VDataType)));
+
         if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
         {
-            constexpr index_t kBlockSize   = Problem::kBlockSize;
-            constexpr index_t kNPerBlock   = Problem::BlockFmhaShape::kN1;
-            constexpr index_t kKPerBlock   = Problem::BlockFmhaShape::kK1;
-            constexpr index_t total_pixels = kNPerBlock * kKPerBlock / kBlockSize;
-            constexpr index_t kMaxVecLoad =
-                min(total_pixels, static_cast<index_t>(16 / sizeof(VDataType)));
             constexpr index_t kMinVecLoad = 4 / sizeof(VDataType);
 
             constexpr index_t kVecLoad = ((total_pixels / kMaxVecLoad) >= kMinVecLoad)
@@ -410,7 +424,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         }
         else
         {
-            return 16 / sizeof(VDataType);
+            return kMaxVecLoad;
         }
     }
 
@@ -948,20 +962,19 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
             {
                 return WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution<>{};
                 // return
-                // WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB<
+                // WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB<
                 //         WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<typename
                 //         Problem::PDataType, typename Problem::VDataType>>>{};
             }
             else
             {
-                return WarpGemmMfmaDispatcher<
-                    typename Problem::PDataType,
-                    typename Problem::VDataType,
-                    typename Problem::OaccDataType,
-                    Problem::BlockFmhaShape::Gemm1WarpTile::at(number<0>{}),
-                    Problem::BlockFmhaShape::Gemm1WarpTile::at(number<1>{}),
-                    Problem::BlockFmhaShape::Gemm1WarpTile::at(number<2>{}),
-                    true>{};
+                return WarpGemmDispatcher<typename Problem::PDataType,
+                                          typename Problem::VDataType,
+                                          typename Problem::OaccDataType,
+                                          Problem::BlockFmhaShape::Gemm1WarpTile::at(number<0>{}),
+                                          Problem::BlockFmhaShape::Gemm1WarpTile::at(number<1>{}),
+                                          Problem::BlockFmhaShape::Gemm1WarpTile::at(number<2>{}),
+                                          true>{};
             }
         }();
 
diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
index a5f9f31d6a..faeb5cf6b3 100644
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -213,7 +213,7 @@ struct MoeSortingKernel
 
     using Hargs = MoeSortingHostArgs;
 
-    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t kBlockSize = 256;
     static constexpr index_t OCCUPANCY  = 2; // hard coded
 
     struct Kargs
@@ -487,8 +487,8 @@ struct MoeSortingKernel
         vector_type* p_buf = reinterpret_cast<vector_type*>(buf);
         auto zero_         = vector_type{0};
 
-        for(long_index_t i = (blockIdx.x - 1) * BLOCK_SIZE + threadIdx.x; i < total_elems;
-            i += (gridDim.x - 1) * BLOCK_SIZE)
+        for(long_index_t i = (blockIdx.x - 1) * kBlockSize + threadIdx.x; i < total_elems;
+            i += (gridDim.x - 1) * kBlockSize)
         {
             p_buf[i] = zero_;
         }
@@ -1419,7 +1419,7 @@ template <typename Problem_>
 struct MoeSortingClearWorkspaceKernel
 {
     using Problem                       = remove_cvref_t<Problem_>;
-    static constexpr index_t BLOCK_SIZE = Problem::BlockSize;
+    static constexpr index_t kBlockSize = Problem::BlockSize;
     static constexpr index_t OCCUPANCY  = Problem::Occu;
 
     using Hargs = MoeSortingHostArgs;
@@ -1461,7 +1461,7 @@ struct MoeSortingClearWorkspaceKernel
 
     CK_TILE_HOST static constexpr auto GridSize(const Hargs&) { return get_num_cu() * OCCUPANCY; }
 
-    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(kBlockSize); }
 
     // in byte
     CK_TILE_HOST static constexpr auto GetSmemSize() { return 0; }
@@ -1499,8 +1499,8 @@ struct MoeSortingClearWorkspaceKernel
         vector_type* p_expert_mesh = reinterpret_cast<vector_type*>(kargs.p_expert_mesh);
         auto zero_                 = vector_type{0};
 
-        for(index_t i = blockIdx.x * BLOCK_SIZE + threadIdx.x; i < total_elems;
-            i += gridDim.x * BLOCK_SIZE)
+        for(index_t i = blockIdx.x * kBlockSize + threadIdx.x; i < total_elems;
+            i += gridDim.x * kBlockSize)
         {
             p_expert_mesh[i] = zero_;
         }
@@ -1560,7 +1560,7 @@ struct MoeSortingMultiPhaseKernel_P0
     using WeightType = typename Problem::WeightType;
     using MeshType   = typename Problem::MeshType;
 
-    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t kBlockSize = 256;
     static constexpr index_t OCCUPANCY  = 2; // hard coded
 
     typedef MoeSortingHostArgs MoeSortingKargs;
@@ -1604,7 +1604,7 @@ struct MoeSortingMultiPhaseKernel_P0
 
     CK_TILE_HOST static constexpr auto GridSize(const Hargs&) { return get_num_cu() * OCCUPANCY; }
 
-    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(kBlockSize); }
 
     // in byte
     CK_TILE_HOST static constexpr auto GetSmemSize() { return 0; }
@@ -1647,8 +1647,8 @@ struct MoeSortingMultiPhaseKernel_P0
         index_t total_elem = rounded_tokens * kargs.topk_mdiv.divisor / Problem::SubTokenTile;
 
 #pragma unroll Problem::SubTokenTile
-        for(index_t i = blockIdx.x * BLOCK_SIZE + threadIdx.x; i < total_elem;
-            i += gridDim.x * BLOCK_SIZE)
+        for(index_t i = blockIdx.x * kBlockSize + threadIdx.x; i < total_elem;
+            i += gridDim.x * kBlockSize)
         {
             auto x = p_topk_ids[i];
             static_for<0, Problem::SubTokenTile, 1>{}([&](auto j) {
@@ -1678,7 +1678,7 @@ struct MoeSortingMultiPhaseKernel_P1
     using WeightType = typename Problem::WeightType;
     using MeshType   = typename Problem::MeshType;
 
-    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t kBlockSize = 256;
     static constexpr index_t OCCUPANCY  = 2; // hard coded
 
     typedef MoeSortingHostArgs MoeSortingKargs;
@@ -1709,12 +1709,12 @@ struct MoeSortingMultiPhaseKernel_P1
 
     CK_TILE_HOST static constexpr auto GridSize(const Hargs& h) { return dim3(h.num_experts); }
 
-    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(kBlockSize); }
 
     // in byte
     CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize()
     {
-        return BLOCK_SIZE / get_warp_size() * sizeof(IndexType);
+        return kBlockSize / get_warp_size() * sizeof(IndexType); // one IndexType slot per wave
     }
 
     CK_TILE_DEVICE void operator()(Kargs kargs) const
@@ -1756,7 +1756,7 @@ struct MoeSortingMultiPhaseKernel_P1
         r_t* p_expert_mesh = reinterpret_cast<r_t*>(
             reinterpret_cast<MeshType*>(kargs.p_expert_mesh) + eid * mesh_stride);
 
-        int loops = (mesh_stride / index_pack + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        int loops = (mesh_stride / index_pack + kBlockSize - 1) / kBlockSize;
 
         if constexpr(Problem::LocalExpertMasking)
         {
@@ -1768,7 +1768,7 @@ struct MoeSortingMultiPhaseKernel_P1
         index_t cnt = 0; // per-wave cnt
         for(int i = 0; i < loops; i++)
         {
-            int position = i * BLOCK_SIZE + threadIdx.x;
+            int position = i * kBlockSize + threadIdx.x;
             r_t v{0};
             if(position < (mesh_stride / index_pack))
                 v = p_expert_mesh[position];
@@ -1792,7 +1792,7 @@ struct MoeSortingMultiPhaseKernel_P1
         if(threadIdx.x == 0)
         {
             index_t c = 0;
-            for(auto i = 0; i < (BLOCK_SIZE / get_warp_size()); i++)
+            for(auto i = 0; i < (kBlockSize / get_warp_size()); i++)
             {
                 c += s[i];
             }
@@ -1811,7 +1811,7 @@ struct MoeSortingMultiPhaseKernel_P01
     using WeightType = typename Problem::WeightType;
     using MeshType   = typename Problem::MeshType;
 
-    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t kBlockSize = 256;
     static constexpr index_t OCCUPANCY  = 2; // hard coded
 
     typedef MoeSortingHostArgs MoeSortingKargs;
@@ -1878,12 +1878,12 @@ struct MoeSortingMultiPhaseKernel_P01
 
     CK_TILE_HOST static constexpr auto GridSize(const Hargs&) { return get_num_cu() * OCCUPANCY; }
 
-    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(kBlockSize); }
 
     CK_TILE_HOST static constexpr auto WGCounts(const Hargs& h)
     {
         index_t total_elem = h.tokens * h.topk / Problem::SubTokenTile;
-        index_t elem_cnt   = (total_elem + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        index_t elem_cnt   = (total_elem + kBlockSize - 1) / kBlockSize;
 
         // no more than grid_size
         return min(elem_cnt, GridSize(h));
@@ -1892,7 +1892,7 @@ struct MoeSortingMultiPhaseKernel_P01
     // in byte
     CK_TILE_HOST static constexpr auto GetSmemSize()
     {
-        return BLOCK_SIZE / get_warp_size() * sizeof(IndexType);
+        return kBlockSize / get_warp_size() * sizeof(IndexType); // one IndexType slot per wave
     }
 
     CK_TILE_DEVICE void operator()(Kargs kargs) const
@@ -1921,7 +1921,7 @@ struct MoeSortingMultiPhaseKernel_P01
             if constexpr(Problem::LocalToken)
             {
                 index_t total_elem = rounded_tokens * kargs.topk / Problem::SubTokenTile;
-                index_t elem_cnt   = (total_elem + BLOCK_SIZE - 1) / BLOCK_SIZE;
+                index_t elem_cnt   = (total_elem + kBlockSize - 1) / kBlockSize;
 
                 // no more than grid_size
                 return min(elem_cnt, kargs.wg_count);
@@ -1940,8 +1940,8 @@ struct MoeSortingMultiPhaseKernel_P01
             index_t total_elem = rounded_tokens * kargs.topk_mdiv.divisor / Problem::SubTokenTile;
 
 #pragma unroll Problem::SubTokenTile
-            for(index_t i = blockIdx.x * BLOCK_SIZE + threadIdx.x; i < total_elem;
-                i += BLOCK_SIZE * gridDim.x)
+            for(index_t i = blockIdx.x * kBlockSize + threadIdx.x; i < total_elem;
+                i += kBlockSize * gridDim.x)
             {
                 auto x = p_topk_ids[i];
                 static_for<0, Problem::SubTokenTile, 1>{}([&](auto j) {
@@ -1996,7 +1996,7 @@ struct MoeSortingMultiPhaseKernel_P01
 
                 auto f_sum = [](auto x_, auto y_) { return x_ + y_; };
 
-                int loops = (kargs.mesh_stride / index_pack + BLOCK_SIZE - 1) / BLOCK_SIZE;
+                int loops = (kargs.mesh_stride / index_pack + kBlockSize - 1) / kBlockSize;
 
                 if constexpr(Problem::LocalExpertMasking)
                 {
@@ -2008,7 +2008,7 @@ struct MoeSortingMultiPhaseKernel_P01
                 index_t cnt = 0; // per-wave cnt
                 for(int i = 0; i < loops; i++)
                 {
-                    int position = i * BLOCK_SIZE + threadIdx.x;
+                    int position = i * kBlockSize + threadIdx.x;
                     r_t v{0};
                     if(position < (kargs.mesh_stride / index_pack))
                         v = p_expert_mesh[position];
@@ -2033,7 +2033,7 @@ struct MoeSortingMultiPhaseKernel_P01
                 if(threadIdx.x == 0)
                 {
                     index_t c = 0;
-                    for(auto i = 0; i < (BLOCK_SIZE / get_warp_size()); i++)
+                    for(auto i = 0; i < (kBlockSize / get_warp_size()); i++)
                     {
                         c += s[i];
                     }
@@ -2055,7 +2055,7 @@ struct MoeSortingMultiPhaseKernel_P2
     using WeightType = typename Problem::WeightType;
     using MeshType   = typename Problem::MeshType;
 
-    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t kBlockSize = 256;
     static constexpr index_t OCCUPANCY  = 2; // hard coded
 
     typedef MoeSortingHostArgs MoeSortingKargs;
@@ -2123,17 +2123,17 @@ struct MoeSortingMultiPhaseKernel_P2
         return dim3(h.num_experts + get_num_cu() * OCCUPANCY);
 #else
         // use 1 block to cumsum
-        return dim3(1 + ck_tile::integer_divide_ceil(h.moe_buf_bytes, BLOCK_SIZE * 16));
+        return dim3(1 + ck_tile::integer_divide_ceil(h.moe_buf_bytes, kBlockSize * 16));
 #endif
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(kBlockSize); }
 
     // in byte
     CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize()
     {
-        // return 2 * BLOCK_SIZE * sizeof(IndexType);
-        return (4 + 2 * BLOCK_SIZE / get_warp_size()) * sizeof(IndexType);
+        // 4 leading slots, then per-wave cumsum_a and cumsum_b arrays
+        return (4 + 2 * kBlockSize / get_warp_size()) * sizeof(IndexType);
     }
 
     // reduce single pixel within a wave
@@ -2142,7 +2142,7 @@ struct MoeSortingMultiPhaseKernel_P2
         if(blockIdx.x > 0)
         {
 #if MOE_SORTING_FMOE_2D_BUF
-            impl::moe_buf_set_zero_kernel_2d<BLOCK_SIZE>(kargs.p_moe_buf,
+            impl::moe_buf_set_zero_kernel_2d<kBlockSize>(kargs.p_moe_buf,
                                                          kargs.tokens,
                                                          kargs.moe_buf_interm_dim,
                                                          kargs.moe_buf_elem_bytes,
@@ -2150,7 +2150,7 @@ struct MoeSortingMultiPhaseKernel_P2
                                                          gridDim.x - 1);
             return;
 #else
-            impl::moe_buf_set_zero_kernel<BLOCK_SIZE>(
+            impl::moe_buf_set_zero_kernel<kBlockSize>(
                 reinterpret_cast<uint8x16_t*>(kargs.p_moe_buf),
                 kargs.moe_buf_bytes,
                 blockIdx.x - 1);
@@ -2167,7 +2167,7 @@ struct MoeSortingMultiPhaseKernel_P2
             reinterpret_cast<IndexType*>(kargs.p_total_tokens_post_pad);
         IndexType* p_sorted_expert_ids = reinterpret_cast<IndexType*>(kargs.p_sorted_expert_ids);
 
-        const index_t loops = (kargs.num_experts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        const index_t loops = (kargs.num_experts + kBlockSize - 1) / kBlockSize;
         index_t wave_id     = threadIdx.x / get_warp_size();
         index_t lane_id     = threadIdx.x % get_warp_size();
 
@@ -2176,7 +2176,7 @@ struct MoeSortingMultiPhaseKernel_P2
 
         for(index_t i = 0; i < loops; i++)
         {
-            index_t position = i * BLOCK_SIZE + threadIdx.x;
+            index_t position = i * kBlockSize + threadIdx.x;
             IndexType a_     = 0; // token count for a expert
             IndexType b_     = 0; // mask for a expert
             if(position < kargs.num_experts)
@@ -2221,15 +2221,15 @@ struct MoeSortingMultiPhaseKernel_P2
             if(lane_id == get_warp_size() - 1)
             {
                 s[4 + wave_id]                                = cumsum_a;
-                s[4 + wave_id + BLOCK_SIZE / get_warp_size()] = cumsum_b;
+                s[4 + wave_id + kBlockSize / get_warp_size()] = cumsum_b;
             }
 
             __syncthreads();
 
             // reduce cross wave
-            static_for<0, BLOCK_SIZE / get_warp_size() - 1, 1>{}([&](auto i_w) {
+            static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
                 IndexType prev_a = s[4 + i_w];
-                IndexType prev_b = s[4 + i_w + BLOCK_SIZE / get_warp_size()];
+                IndexType prev_b = s[4 + i_w + kBlockSize / get_warp_size()];
                 prev_a           = wave_id > i_w ? prev_a : 0; // mask out
                 prev_b           = wave_id > i_w ? prev_b : 0; // mask out
                 cumsum_a += prev_a;
@@ -2240,7 +2240,7 @@ struct MoeSortingMultiPhaseKernel_P2
             cumsum_a += prev_cumsum_a;
             cumsum_b += prev_cumsum_b;
 
-            if(threadIdx.x == BLOCK_SIZE - 1)
+            if(threadIdx.x == kBlockSize - 1)
             {
                 s[2] = cumsum_a; // store the last cumsum
                 s[3] = cumsum_b;
@@ -2297,7 +2297,7 @@ struct MoeSortingMultiPhaseKernel_P3
     using WeightType = typename Problem::WeightType;
     using MeshType   = typename Problem::MeshType;
 
-    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t kBlockSize = 256;
     static constexpr index_t OCCUPANCY  = 2; // hard coded
 
     typedef MoeSortingHostArgs MoeSortingKargs;
@@ -2341,12 +2341,12 @@ struct MoeSortingMultiPhaseKernel_P3
 
     CK_TILE_HOST static constexpr auto GridSize(const Hargs& h) { return dim3(h.num_experts); }
 
-    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(kBlockSize); }
 
     // in byte
     CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize()
     {
-        return (4 + BLOCK_SIZE / get_warp_size()) * sizeof(IndexType);
+        return (4 + kBlockSize / get_warp_size()) * sizeof(IndexType); // 4 slots + one per wave
     }
 
     CK_TILE_DEVICE void operator()(Kargs kargs) const
@@ -2391,11 +2391,11 @@ struct MoeSortingMultiPhaseKernel_P3
         }
 
         // cumsum one by one
-        int loops       = (kargs.mesh_stride + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        int loops       = (kargs.mesh_stride + kBlockSize - 1) / kBlockSize;
         int prev_cumsum = 0;
         for(int i = 0; i < loops; i++)
         {
-            int i_token = i * BLOCK_SIZE + threadIdx.x;
+            int i_token = i * kBlockSize + threadIdx.x;
             IndexType x = 0;
             if(i_token < tokens)
             {
@@ -2414,13 +2414,13 @@ struct MoeSortingMultiPhaseKernel_P3
             __syncthreads();
 
             // reduce cross wave
-            static_for<0, BLOCK_SIZE / get_warp_size() - 1, 1>{}([&](auto i_w) {
+            static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
                 IndexType prev = s[4 + i_w];
                 prev           = wave_id > i_w ? prev : 0; // mask out
                 cumsum += prev;
             });
             cumsum += prev_cumsum; // add previous round cumsum
-            if(threadIdx.x == BLOCK_SIZE - 1)
+            if(threadIdx.x == kBlockSize - 1)
             {
                 s[0] = cumsum;
             }
@@ -2441,7 +2441,7 @@ struct MoeSortingMultiPhaseKernel_P3
             }
         }
 
-        for(index_t i = e_start + prev_cumsum + threadIdx.x; i < e_end; i += BLOCK_SIZE)
+        for(index_t i = e_start + prev_cumsum + threadIdx.x; i < e_end; i += kBlockSize)
         {
 #if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
             p_sorted_token_ids[i] = MOE_SORTING_MOCK_ID(tokens, kargs.topk_mdiv.divisor);
@@ -2457,9 +2457,9 @@ namespace impl {
 // we use dynamic LDS size here
 CK_TILE_HOST constexpr auto moe_sorting_get_smem_size_p23(int num_experts_)
 {
-    constexpr index_t BLOCK_SIZE     = 256; // hardcoded 256
+    constexpr index_t kBlockSize     = 256; // hardcoded; matches the P23 kernel's kBlockSize
     const index_t expert_cumsum_elem = num_experts_ + 1;
-    return (4 + 2 * BLOCK_SIZE / get_warp_size() + expert_cumsum_elem) * sizeof(int);
+    return (4 + 2 * kBlockSize / get_warp_size() + expert_cumsum_elem) * sizeof(int);
 }
 } // namespace impl
 
@@ -2473,7 +2473,7 @@ struct MoeSortingMultiPhaseKernel_P23
     using WeightType = typename Problem::WeightType;
     using MeshType   = typename Problem::MeshType;
 
-    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t kBlockSize = 256;
     static constexpr index_t OCCUPANCY  = 2; // hard coded
 
     typedef MoeSortingHostArgs MoeSortingKargs;
@@ -2563,18 +2563,18 @@ struct MoeSortingMultiPhaseKernel_P23
         return dim3(h.num_experts + get_num_cu() * OCCUPANCY);
 #else
         // use 1 block to cumsum
-        // return dim3(1 + ck_tile::integer_divide_ceil(h.moe_buf_bytes, BLOCK_SIZE * 16));
-        return dim3(h.num_experts + ck_tile::integer_divide_ceil(h.moe_buf_bytes, BLOCK_SIZE * 16));
+        // return dim3(1 + ck_tile::integer_divide_ceil(h.moe_buf_bytes, kBlockSize * 16));
+        return dim3(h.num_experts + ck_tile::integer_divide_ceil(h.moe_buf_bytes, kBlockSize * 16));
 #endif
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(kBlockSize); }
 
     // only use this at host !
     CK_TILE_HOST static constexpr auto GetSmemSize(const Hargs& h)
     {
         const auto smem_23 = impl::moe_sorting_get_smem_size_p23(h.num_experts);
-        const auto smem_sf = BLOCK_SIZE * 4 * sizeof(IndexType);
+        const auto smem_sf = kBlockSize * 4 * sizeof(IndexType);
         return max(smem_23, smem_sf);
     }
 
@@ -2595,7 +2595,7 @@ struct MoeSortingMultiPhaseKernel_P23
         if(static_cast<index_t>(blockIdx.x) >= kargs.num_experts)
         {
 #if MOE_SORTING_FMOE_2D_BUF
-            impl::moe_buf_set_zero_kernel_2d<BLOCK_SIZE>(kargs.p_moe_buf,
+            impl::moe_buf_set_zero_kernel_2d<kBlockSize>(kargs.p_moe_buf,
                                                          tokens,
                                                          kargs.moe_buf_interm_dim,
                                                          kargs.moe_buf_elem_bytes,
@@ -2603,7 +2603,7 @@ struct MoeSortingMultiPhaseKernel_P23
                                                          gridDim.x - kargs.num_experts);
             return;
 #else
-            impl::moe_buf_set_zero_kernel<BLOCK_SIZE>(
+            impl::moe_buf_set_zero_kernel<kBlockSize>(
                 reinterpret_cast<uint8x16_t*>(kargs.p_moe_buf),
                 kargs.moe_buf_bytes,
                 blockIdx.x - kargs.num_experts);
@@ -2618,13 +2618,13 @@ struct MoeSortingMultiPhaseKernel_P23
             const IndexType* p_local_expert_mask =
                 static_cast<const IndexType*>(kargs.p_local_expert_mask);
             IndexType* p_expert_cumsum      = reinterpret_cast<IndexType*>(kargs.p_expert_cumsum);
-            IndexType* p_expert_cumsum_smem = s + 4 + 2 * BLOCK_SIZE / get_warp_size();
+            IndexType* p_expert_cumsum_smem = s + 4 + 2 * kBlockSize / get_warp_size();
             IndexType* p_total_tokens_post_pad =
                 reinterpret_cast<IndexType*>(kargs.p_total_tokens_post_pad);
             IndexType* p_sorted_expert_ids =
                 reinterpret_cast<IndexType*>(kargs.p_sorted_expert_ids);
 
-            const index_t loops = (kargs.num_experts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+            const index_t loops = (kargs.num_experts + kBlockSize - 1) / kBlockSize;
             index_t wave_id     = threadIdx.x / get_warp_size();
             index_t lane_id     = threadIdx.x % get_warp_size();
 
@@ -2633,7 +2633,7 @@ struct MoeSortingMultiPhaseKernel_P23
 
             for(index_t i = 0; i < loops; i++)
             {
-                index_t position = i * BLOCK_SIZE + threadIdx.x;
+                index_t position = i * kBlockSize + threadIdx.x;
                 IndexType a_     = 0; // token count for a expert
                 IndexType b_     = 0; // mask for a expert
                 if(position < kargs.num_experts)
@@ -2678,15 +2678,15 @@ struct MoeSortingMultiPhaseKernel_P23
                 if(lane_id == get_warp_size() - 1)
                 {
                     s[4 + wave_id]                                = cumsum_a;
-                    s[4 + wave_id + BLOCK_SIZE / get_warp_size()] = cumsum_b;
+                    s[4 + wave_id + kBlockSize / get_warp_size()] = cumsum_b;
                 }
 
                 __syncthreads();
 
                 // reduce cross wave
-                static_for<0, BLOCK_SIZE / get_warp_size() - 1, 1>{}([&](auto i_w) {
+                static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
                     IndexType prev_a = s[4 + i_w];
-                    IndexType prev_b = s[4 + i_w + BLOCK_SIZE / get_warp_size()];
+                    IndexType prev_b = s[4 + i_w + kBlockSize / get_warp_size()];
                     prev_a           = wave_id > i_w ? prev_a : 0; // mask out
                     prev_b           = wave_id > i_w ? prev_b : 0; // mask out
                     cumsum_a += prev_a;
@@ -2697,7 +2697,7 @@ struct MoeSortingMultiPhaseKernel_P23
                 cumsum_a += prev_cumsum_a;
                 cumsum_b += prev_cumsum_b;
 
-                if(threadIdx.x == BLOCK_SIZE - 1)
+                if(threadIdx.x == kBlockSize - 1)
                 {
                     s[2] = cumsum_a; // store the last cumsum
                     s[3] = cumsum_b;
@@ -2758,7 +2758,7 @@ struct MoeSortingMultiPhaseKernel_P23
             IndexType* s                  = reinterpret_cast<IndexType*>(smem);
             MeshType* p_expert_mesh       = reinterpret_cast<MeshType*>(kargs.p_expert_mesh);
             IndexType* p_sorted_token_ids = reinterpret_cast<IndexType*>(kargs.p_sorted_token_ids);
-            IndexType* p_expert_cumsum_smem = s + 4 + 2 * BLOCK_SIZE / get_warp_size();
+            IndexType* p_expert_cumsum_smem = s + 4 + 2 * kBlockSize / get_warp_size();
             const WeightType* p_weights     = static_cast<const WeightType*>(kargs.p_weights);
             WeightType* p_sorted_weights    = reinterpret_cast<WeightType*>(kargs.p_sorted_weights);
 
@@ -2795,13 +2795,13 @@ struct MoeSortingMultiPhaseKernel_P23
             constexpr index_t index_pack = Problem::SubTokenTile;              // always packed
             using r_t                    = ext_vector_t<MeshType, index_pack>; // always use int32x4
             using d_t                    = ext_vector_t<index_t, index_pack>;
-            int loops                    = (mesh_stride / index_pack + BLOCK_SIZE - 1) / BLOCK_SIZE;
+            int loops                    = (mesh_stride / index_pack + kBlockSize - 1) / kBlockSize;
 
             int prev_cumsum = 0;
 
             for(int i = 0; i < loops; i++)
             {
-                int i_token_pack = i * BLOCK_SIZE + threadIdx.x;
+                int i_token_pack = i * kBlockSize + threadIdx.x;
                 r_t x_v          = 0;
                 if(i_token_pack < (tokens + index_pack - 1) / index_pack)
                 {
@@ -2819,7 +2819,7 @@ struct MoeSortingMultiPhaseKernel_P23
 
                     static_for<0, index_pack, 1>{}([&](auto j_) {
                         constexpr auto j = j_.value;
-                        x_r[j]           = reinterpret_cast<MeshType*>(s)[threadIdx.x + j * BLOCK_SIZE];
+                        x_r[j]           = reinterpret_cast<MeshType*>(s)[threadIdx.x + j * kBlockSize];
                     });
                 }
 #else
@@ -2830,7 +2830,7 @@ struct MoeSortingMultiPhaseKernel_P23
 #pragma unroll
                     for(int j = 0; j < index_pack / 2; j++)
                     {
-                        int i_token = i * BLOCK_SIZE * index_pack + threadIdx.x + j * BLOCK_SIZE;
+                        int i_token = i * kBlockSize * index_pack + threadIdx.x + j * kBlockSize;
                         index_t x   = x_d[j];
                         int i_topk  = x - 1;          // topk of this token
                         int i_show  = x != 0 ? 1 : 0; // has this token or not
@@ -2845,13 +2845,13 @@ struct MoeSortingMultiPhaseKernel_P23
                         __syncthreads();
 
                         // reduce cross wave
-                        static_for<0, BLOCK_SIZE / get_warp_size() - 1, 1>{}([&](auto i_w) {
+                        static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
                             IndexType prev = s[4 + i_w];
                             prev           = wave_id > i_w ? prev : 0; // mask out
                             cumsum += prev;
                         });
                         cumsum += prev_cumsum; // add previous round cumsum
-                        if(threadIdx.x == BLOCK_SIZE - 1)
+                        if(threadIdx.x == kBlockSize - 1)
                         {
                             s[0] = cumsum;
                         }
@@ -2896,13 +2896,13 @@ struct MoeSortingMultiPhaseKernel_P23
                         __syncthreads();
 
                         // reduce cross wave
-                        static_for<0, BLOCK_SIZE / get_warp_size() - 1, 1>{}([&](auto i_w) {
+                        static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
                             IndexType prev = s[4 + i_w];
                             prev           = wave_id > i_w ? prev : 0; // mask out
                             cumsum += prev;
                         });
                         cumsum += prev_cumsum; // add previous round cumsum
-                        if(threadIdx.x == BLOCK_SIZE - 1)
+                        if(threadIdx.x == kBlockSize - 1)
                         {
                             s[0] = cumsum;
                         }
@@ -2912,10 +2912,10 @@ struct MoeSortingMultiPhaseKernel_P23
                         int position = cumsum - cumsum_store;
                         static_for<0, index_pack, 1>{}([&](auto j_) {
                             constexpr auto j = j_.value;
-                            // int i_token = i * BLOCK_SIZE * index_pack + threadIdx.x + j *
-                            // BLOCK_SIZE;
+                            // int i_token = i * kBlockSize * index_pack + threadIdx.x + j *
+                            // kBlockSize;
                             int i_token =
-                                i * BLOCK_SIZE * index_pack + threadIdx.x * index_pack + j;
+                                i * kBlockSize * index_pack + threadIdx.x * index_pack + j;
 
                             if(i_show[j])
                             {
@@ -2932,7 +2932,7 @@ struct MoeSortingMultiPhaseKernel_P23
                         });
 
 #if 0
-                        int i_token = i * BLOCK_SIZE * index_pack + threadIdx.x * 2 + j * BLOCK_SIZE * 2;
+                        int i_token = i * kBlockSize * index_pack + threadIdx.x * 2 + j * kBlockSize * 2;
                         index_t x   = x_d[j];
                         index_t x0  = static_cast<index_t>(x & 0xffff);
                         index_t x1  = static_cast<index_t>(x >> 16);
@@ -2951,13 +2951,13 @@ struct MoeSortingMultiPhaseKernel_P23
                         __syncthreads();
 
                         // reduce cross wave
-                        static_for<0, BLOCK_SIZE / get_warp_size() - 1, 1>{}([&](auto i_w) {
+                        static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
                             IndexType prev = s[4 + i_w];
                             prev           = wave_id > i_w ? prev : 0; // mask out
                             cumsum += prev;
                         });
                         cumsum += prev_cumsum; // add previous round cumsum
-                        if(threadIdx.x == BLOCK_SIZE - 1)
+                        if(threadIdx.x == kBlockSize - 1)
                         {
                             s[0] = cumsum;
                         }
@@ -2996,7 +2996,7 @@ struct MoeSortingMultiPhaseKernel_P23
                 }
             }
 
-            for(index_t i = e_start + prev_cumsum + threadIdx.x; i < e_end; i += BLOCK_SIZE)
+            for(index_t i = e_start + prev_cumsum + threadIdx.x; i < e_end; i += kBlockSize)
             {
 #if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
                 p_sorted_token_ids[i] = MOE_SORTING_MOCK_ID(tokens, kargs.topk_mdiv.divisor);
diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
index 0c8baaf191..dbd6913cdb 100644
--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
@@ -568,7 +568,7 @@ struct FusedMoeGemmPipelineFlatmmPolicy
                      std::is_same_v<typename Problem::GDataType, ck_tile::bf16_t> &&
                      S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16)
         {
-            return WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
+            return WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
                 WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<wg_ctrl>,
                 2>>{};
         }
@@ -576,7 +576,7 @@ struct FusedMoeGemmPipelineFlatmmPolicy
                           std::is_same_v<typename Problem::GDataType, ck_tile::int8_t> &&
                           S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 32)
         {
-            return WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
+            return WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
                 WarpGemmAttributeMfmaImpl_i32_32x32x16_i8<wg_ctrl>,
                 2>>{};
         }
@@ -695,7 +695,7 @@ struct FusedMoeGemmPipelineFlatmmPolicy
                      std::is_same_v<typename Problem::DDataType, ck_tile::bf16_t> &&
                      S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 16)
         {
-            return WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
+            return WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
                 WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<wg_ctrl>,
                 2>>{};
         }
@@ -703,7 +703,7 @@ struct FusedMoeGemmPipelineFlatmmPolicy
                           std::is_same_v<typename Problem::DDataType, ck_tile::int8_t> &&
                           S_::Warp_M0 == 32 && S_::Warp_N0 == 32 && S_::Warp_K0 == 32)
         {
-            return WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
+            return WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
                 WarpGemmAttributeMfmaImpl_i32_32x32x16_i8<wg_ctrl>,
                 2>>{};
         }
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index c9bedd7c53..28273f581d 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -1,5 +1,5 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
 
 #pragma once
 
@@ -8,6 +8,8 @@
 #include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_custom_policy.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2_custom_policy.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_custom_policy.hpp"
@@ -31,6 +33,7 @@
 #include "ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
 #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
+#include "ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
@@ -56,9 +59,15 @@
 #include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_16bit_traits.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_8bit_traits.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_base_traits.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp"
+#include "ck_tile/ops/gemm/warp/warp_wmma_gemm.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2.hpp
new file mode 100644
index 0000000000..8313693d3a
--- /dev/null
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2.hpp
@@ -0,0 +1,372 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2_custom_policy.hpp"
+
+namespace ck_tile {
+
+// Block-level GEMM variant that gives finer control over the instruction issue order.
+// A is a block-distributed tensor
+// B is a block-distributed tensor
+// C is a block-distributed tensor
+template <typename Problem_, typename Policy_>
+struct BlockGemmARegBRegCRegV2
+{
+    private:
+    template <typename PipelineProblem_, typename GemmPolicy_>
+    struct GemmTraits_ // compile-time traits derived from the Problem and Policy types
+    {
+        using Problem        = remove_cvref_t<PipelineProblem_>;
+        using Policy         = remove_cvref_t<GemmPolicy_>;
+        using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+        using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+        using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+        using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+
+        static constexpr index_t kBlockSize = Problem::kBlockSize;
+
+        static constexpr index_t MPerBlock = BlockGemmShape::kM;
+        static constexpr index_t NPerBlock = BlockGemmShape::kN;
+        static constexpr index_t KPerBlock = BlockGemmShape::kK;
+        // the policy supplies a tuple that is read below as (WarpGemm, MWarp, NWarp)
+        static constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
+        using WarpGemm               = remove_cvref_t<decltype(config.template at<0>())>;
+        // warp counts along M/N and the per-warp trip counts used as static_for bounds
+        static constexpr index_t MWarp        = config.template at<1>();
+        static constexpr index_t NWarp        = config.template at<2>();
+        static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WarpGemm::kM);
+        static constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WarpGemm::kN);
+        static constexpr index_t KIterPerWarp = KPerBlock / WarpGemm::kK;
+
+        static constexpr auto BlockGemmLoopOrder = Policy::BlockGemmLoopOrder;
+
+        static constexpr index_t KPack = WarpGemm::kKPerThread; // not referenced in this struct
+    };
+
+    public:
+    using Problem = remove_cvref_t<Problem_>;
+    using Policy  = remove_cvref_t<Policy_>;
+
+    using Traits = GemmTraits_<Problem, Policy>;
+
+    using WarpGemm                           = typename Traits::WarpGemm;
+    using BlockGemmShape                     = typename Traits::BlockGemmShape;
+    static constexpr auto BlockGemmLoopOrder = Traits::BlockGemmLoopOrder;
+
+    using ADataType = remove_cvref_t<typename Traits::ADataType>;
+    using BDataType = remove_cvref_t<typename Traits::BDataType>;
+    using CDataType = remove_cvref_t<typename Traits::CDataType>;
+
+    static constexpr index_t KIterPerWarp = Traits::KIterPerWarp;
+    static constexpr index_t MIterPerWarp = Traits::MIterPerWarp;
+    static constexpr index_t NIterPerWarp = Traits::NIterPerWarp;
+
+    static constexpr index_t MWarp            = Traits::MWarp;
+    static constexpr index_t NWarp            = Traits::NWarp;
+    static constexpr bool UseDefaultScheduler = (Problem::NumWaveGroups != 1);
+    // Returns the block-level distribution encoding that operator() requires of the A tensor.
+    CK_TILE_DEVICE static constexpr auto MakeABlockDistributionEncode()
+    {
+        if constexpr(UseDefaultScheduler) // i.e. NumWaveGroups != 1
+        {
+            constexpr auto a_block_outer_dstr_encoding =
+                tile_distribution_encoding<sequence<NWarp>,
+                                           tuple<sequence<MIterPerWarp>, sequence<KIterPerWarp>>,
+                                           tuple<>,
+                                           tuple<>,
+                                           sequence<1, 2>, // NOTE(review): KMN loop slices <kIter, mIter> — confirm order
+                                           sequence<0, 0>>{};
+
+            constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+                a_block_outer_dstr_encoding, typename WarpGemm::AWarpDstrEncoding{});
+
+            return a_block_dstr_encode;
+        }
+        else
+        {
+            if constexpr(BlockGemmLoopOrder == GemmLoopOrder::KMN)
+            {
+                constexpr auto a_block_outer_dstr_encoding = tile_distribution_encoding<
+                    sequence<NWarp>,
+                    tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>,
+                    tuple<sequence<1, 0>>,
+                    tuple<sequence<1, 0>>,
+                    sequence<2, 1>, // presumably Y order (K, M), cf. sequence<kIter, mIter> — confirm
+                    sequence<0, 0>>{};
+
+                constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+                    a_block_outer_dstr_encoding, typename WarpGemm::AWarpDstrEncoding{});
+
+                return a_block_dstr_encode;
+            }
+            else if constexpr(BlockGemmLoopOrder == GemmLoopOrder::MNK)
+            {
+                constexpr auto a_block_outer_dstr_encoding = tile_distribution_encoding<
+                    sequence<NWarp>,
+                    tuple<sequence<MIterPerWarp, MWarp>, sequence<KIterPerWarp>>,
+                    tuple<sequence<1, 0>>,
+                    tuple<sequence<1, 0>>,
+                    sequence<1, 2>, // presumably Y order (M, K), cf. sequence<mIter, kIter> — confirm
+                    sequence<0, 0>>{};
+
+                constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+                    a_block_outer_dstr_encoding, typename WarpGemm::AWarpDstrEncoding{});
+
+                return a_block_dstr_encode;
+            }
+        }
+    }
+    // Returns the block-level distribution encoding that operator() requires of the B tensor.
+    CK_TILE_DEVICE static constexpr auto MakeBBlockDistributionEncode()
+    {
+        if constexpr(UseDefaultScheduler) // i.e. NumWaveGroups != 1
+        {
+            constexpr auto b_block_outer_dstr_encoding =
+                tile_distribution_encoding<sequence<MWarp>,
+                                           tuple<sequence<NIterPerWarp>, sequence<KIterPerWarp>>,
+                                           tuple<>,
+                                           tuple<>,
+                                           sequence<1, 2>, // NOTE(review): KMN loop slices <kIter, nIter> — confirm order
+                                           sequence<0, 0>>{};
+            constexpr auto b_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+                b_block_outer_dstr_encoding, typename WarpGemm::BWarpDstrEncoding{});
+
+            return b_block_dstr_encode;
+        }
+        else
+        {
+            if constexpr(BlockGemmLoopOrder == GemmLoopOrder::KMN)
+            {
+                constexpr auto b_block_outer_dstr_encoding = tile_distribution_encoding<
+                    sequence<MWarp>,
+                    tuple<sequence<NIterPerWarp, NWarp>, sequence<KIterPerWarp>>,
+                    tuple<sequence<0, 1>>,
+                    tuple<sequence<0, 1>>,
+                    sequence<2, 1>, // presumably Y order (K, N), cf. sequence<kIter, nIter> — confirm
+                    sequence<0, 0>>{};
+                constexpr auto b_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+                    b_block_outer_dstr_encoding, typename WarpGemm::BWarpDstrEncoding{});
+
+                return b_block_dstr_encode;
+            }
+            else if constexpr(BlockGemmLoopOrder == GemmLoopOrder::MNK)
+            {
+                constexpr auto b_block_outer_dstr_encoding = tile_distribution_encoding<
+                    sequence<MWarp>,
+                    tuple<sequence<NIterPerWarp, NWarp>, sequence<KIterPerWarp>>,
+                    tuple<sequence<0, 1>>,
+                    tuple<sequence<0, 1>>,
+                    sequence<1, 2>, // presumably Y order (N, K), cf. sequence<nIter, kIter> — confirm
+                    sequence<0, 0>>{};
+                constexpr auto b_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+                    b_block_outer_dstr_encoding, typename WarpGemm::BWarpDstrEncoding{});
+                return b_block_dstr_encode;
+            }
+        }
+    }
+    // Returns the block-level distribution encoding that operator() requires of the C tensor.
+    CK_TILE_DEVICE static constexpr auto MakeCBlockDistributionEncode()
+    {
+        if constexpr(UseDefaultScheduler) // i.e. NumWaveGroups != 1
+        {
+            constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+                sequence<MWarp>,
+                tuple<sequence<MIterPerWarp>, sequence<NIterPerWarp, NWarp>>,
+                tuple<>,
+                tuple<>,
+                sequence<1, 2>,
+                sequence<0, 0>>{};
+            constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+                c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});
+
+            return c_block_dstr_encode;
+        }
+        else
+        {
+            constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
+                tuple<sequence<1, 2>>,
+                tuple<sequence<1, 1>>,
+                sequence<1, 2>,
+                sequence<0, 0>>{};
+            constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+                c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});
+
+            return c_block_dstr_encode;
+        }
+    }
+
+    // C += A * B, done one warp tile at a time in the loop order chosen by BlockGemmLoopOrder
+    template <typename CBlockTensor, typename ABlockTensor, typename BBlockTensor>
+    CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
+                                   const ABlockTensor& a_block_tensor,
+                                   const BBlockTensor& b_block_tensor) const
+    {
+        static_assert(std::is_same_v<ADataType, remove_cv_t<typename ABlockTensor::DataType>> &&
+                          std::is_same_v<BDataType, remove_cv_t<typename BBlockTensor::DataType>> &&
+                          std::is_same_v<CDataType, remove_cv_t<typename CBlockTensor::DataType>>,
+                      "wrong!");
+
+        // each operand must use exactly the encoding returned by the matching Make*Encode() helper
+        static_assert(
+            std::is_same_v<remove_cvref_t<decltype(MakeABlockDistributionEncode())>,
+                           remove_cvref_t<decltype(ABlockTensor::get_tile_distribution()
+                                                       .get_static_tile_distribution_encoding())>>,
+            "A distribution is wrong!");
+        static_assert(
+            std::is_same_v<remove_cvref_t<decltype(MakeBBlockDistributionEncode())>,
+                           remove_cvref_t<decltype(BBlockTensor::get_tile_distribution()
+                                                       .get_static_tile_distribution_encoding())>>,
+            "B distribution is wrong!");
+        static_assert(
+            std::is_same_v<remove_cvref_t<decltype(MakeCBlockDistributionEncode())>,
+                           remove_cvref_t<decltype(CBlockTensor::get_tile_distribution()
+                                                       .get_static_tile_distribution_encoding())>>,
+            "C distribution is wrong!");
+
+        using AWarpDstr = typename WarpGemm::AWarpDstr;
+        using BWarpDstr = typename WarpGemm::BWarpDstr;
+        using CWarpDstr = typename WarpGemm::CWarpDstr;
+
+        using AWarpTensor = typename WarpGemm::AWarpTensor;
+        using BWarpTensor = typename WarpGemm::BWarpTensor;
+        using CWarpTensor = typename WarpGemm::CWarpTensor;
+        // Y lengths of one warp tile and all-zero Y index sequences, merged into the slice arguments below
+        constexpr auto a_warp_y_lengths =
+            to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+        constexpr auto b_warp_y_lengths =
+            to_sequence(BWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+        constexpr auto c_warp_y_lengths =
+            to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+
+        constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t<AWarpDstr::NDimY, 0>{};
+        constexpr auto b_warp_y_index_zeros = uniform_sequence_gen_t<BWarpDstr::NDimY, 0>{};
+        constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+        // hot loop: static_for indices are compile-time constants (used as sequence<> arguments)
+        if constexpr(BlockGemmLoopOrder == GemmLoopOrder::KMN)
+        {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                    // read A warp tensor from A block tensor; read once here, reused for every nIter
+                    AWarpTensor a_warp_tensor;
+                    a_warp_tensor.get_thread_buffer() = a_block_tensor.get_y_sliced_thread_data(
+                        merge_sequences(sequence<kIter, mIter>{}, a_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
+
+                    static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                        // read B warp tensor from B block tensor
+                        BWarpTensor b_warp_tensor;
+                        b_warp_tensor.get_thread_buffer() = b_block_tensor.get_y_sliced_thread_data(
+                            merge_sequences(sequence<kIter, nIter>{}, b_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
+                        // read C warp tensor from C block tensor
+                        CWarpTensor c_warp_tensor;
+                        c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data(
+                            merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
+
+                        // warp GEMM
+                        WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
+
+                        // write C warp tensor into C block tensor
+                        c_block_tensor.set_y_sliced_thread_data(
+                            merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
+                            c_warp_tensor.get_thread_buffer());
+                    });
+                });
+            });
+        }
+        else if constexpr(BlockGemmLoopOrder == GemmLoopOrder::MNK)
+        {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                    static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                        // read A warp tensor from A block tensor (read again for every nIter here)
+                        AWarpTensor a_warp_tensor;
+
+                        a_warp_tensor.get_thread_buffer() = a_block_tensor.get_y_sliced_thread_data(
+                            merge_sequences(sequence<mIter, kIter>{}, a_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
+
+                        // read B warp tensor from B block tensor
+                        BWarpTensor b_warp_tensor;
+
+                        b_warp_tensor.get_thread_buffer() = b_block_tensor.get_y_sliced_thread_data(
+                            merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
+
+                        // read C warp tensor from C block tensor
+                        CWarpTensor c_warp_tensor;
+
+                        c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data(
+                            merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
+
+                        // warp GEMM
+                        WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
+
+                        // write C warp tensor into C block tensor
+                        c_block_tensor.set_y_sliced_thread_data(
+                            merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                            merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
+                            c_warp_tensor.get_thread_buffer());
+                    });
+                });
+            });
+        }
+    }
+    // Makes a C block tensor of CDataType using the same encoding as MakeCBlockDistributionEncode().
+    CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
+    {
+        if constexpr(UseDefaultScheduler) // i.e. NumWaveGroups != 1
+        {
+            constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+                sequence<MWarp>,
+                tuple<sequence<MIterPerWarp>, sequence<NIterPerWarp, NWarp>>,
+                tuple<>,
+                tuple<>,
+                sequence<1, 2>,
+                sequence<0, 0>>{};
+
+            constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+                c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});
+            constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+            auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
+            return c_block_tensor;
+        }
+        else
+        {
+            constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+                sequence<>,
+                tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
+                tuple<sequence<1, 2>>,
+                tuple<sequence<1, 1>>,
+                sequence<1, 2>,
+                sequence<0, 0>>{};
+
+            constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+                c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});
+            constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+            auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
+            return c_block_tensor;
+        }
+    }
+
+    // C = A * B. NOTE(review): the call below accumulates into the new tile — confirm it starts zeroed
+    template <typename ABlockTensor, typename BBlockTensor>
+    CK_TILE_DEVICE auto operator()(const ABlockTensor& a_block_tensor,
+                                   const BBlockTensor& b_block_tensor) const
+    {
+        auto c_block_tensor = MakeCBlockTile();
+        operator()(c_block_tensor, a_block_tensor, b_block_tensor);
+        return c_block_tensor;
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2_custom_policy.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2_custom_policy.hpp
new file mode 100644
index 0000000000..c2cfbc083b
--- /dev/null
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v2_custom_policy.hpp
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+enum struct GemmLoopOrder // nesting order of the block GEMM's warp-tile loops, outermost first
+{
+    KMN, // K outermost, then M, then N
+    MNK, // M outermost, then N, then K
+};
+
+template <typename AType_,
+          typename BType_,
+          typename CType_,
+          typename BlockWarps_,
+          typename WarpGemm_,
+          GemmLoopOrder BlockGemmLoopOrder_ = GemmLoopOrder::KMN>
+struct BlockGemmARegBRegCRegV2CustomPolicy // bundles data types, warp layout, warp GEMM, loop order
+{
+    using AType = remove_cvref_t<AType_>;
+    using BType = remove_cvref_t<BType_>;
+    using CType = remove_cvref_t<CType_>;
+
+    using BlockWarps = remove_cvref_t<BlockWarps_>;
+    // BlockWarps is indexed with number<0>, number<1>, number<2> for the M, N and K warp counts
+    static constexpr index_t kMWarps = BlockWarps::at(number<0>{});
+    static constexpr index_t kNWarps = BlockWarps::at(number<1>{});
+    static constexpr index_t kKWarps = BlockWarps::at(number<2>{}); // not returned by the getter below
+
+    using WarpGemm = remove_cvref_t<WarpGemm_>;
+
+    static constexpr auto BlockGemmLoopOrder = BlockGemmLoopOrder_;
+    // Returns (WarpGemm instance, kMWarps, kNWarps); the Problem parameter is not used in the body.
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemmMWarpNWarp()
+    {
+        return make_tuple(WarpGemm{}, kMWarps, kNWarps);
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp
index cfbd78967f..d16651da93 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp
@@ -54,16 +54,16 @@ struct BlockGemmASmemBSmemCRegV1DefaultPolicy
                 return make_tuple(WarpGemmMfmaF16F16F32M32N32K16<>{}, 2, 2);
             }
 #else
-            using WG = WarpGemmMfmaDispatcher<ck_tile::half_t,
-                                              ck_tile::half_t,
-                                              float,
-                                              32,
-                                              32,
-                                              16,
-                                              true,
-                                              false,
-                                              false,
-                                              wg_attr_num_access>;
+            using WG = WarpGemmDispatcher<ck_tile::half_t,
+                                          ck_tile::half_t,
+                                          float,
+                                          32,
+                                          32,
+                                          16,
+                                          true,
+                                          false,
+                                          false,
+                                          wg_attr_num_access>;
             return make_tuple(WG{}, 4, 1);
 #endif
         }
@@ -71,16 +71,16 @@ struct BlockGemmASmemBSmemCRegV1DefaultPolicy
                           std::is_same_v<typename Problem::BDataType, bf16_t> &&
                           std::is_same_v<typename Problem::CDataType, float>)
         {
-            using WG = WarpGemmMfmaDispatcher<ck_tile::bf16_t,
-                                              ck_tile::bf16_t,
-                                              float,
-                                              32,
-                                              32,
-                                              16,
-                                              true,
-                                              false,
-                                              false,
-                                              wg_attr_num_access>;
+            using WG = WarpGemmDispatcher<ck_tile::bf16_t,
+                                          ck_tile::bf16_t,
+                                          float,
+                                          32,
+                                          32,
+                                          16,
+                                          true,
+                                          false,
+                                          false,
+                                          wg_attr_num_access>;
             return make_tuple(WG{}, 4, 1);
         }
         else
diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
index 9c1ce73eac..fcfbf9635f 100644
--- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
@@ -64,6 +64,7 @@ struct BatchedGemmKernel
     /// functions.
     using UniversalGemmKernel =
         UniversalGemmKernel<TilePartitioner_, GemmPipeline_, EpiloguePipeline_>;
+    static constexpr index_t kBlockSize = UniversalGemmKernel::kBlockSize;
 
     using TilePartitioner  = remove_cvref_t<TilePartitioner_>;
     using GemmPipeline     = remove_cvref_t<GemmPipeline_>;
@@ -121,9 +122,16 @@ struct BatchedGemmKernel
         return dim3(TilePartitioner::GridSize(M, N), batch_count, KBatch);
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() -> dim3
+    CK_TILE_HOST static auto BlockSize() -> dim3 // launch block dimensions for this kernel
     {
-        return dim3(UniversalGemmKernel::KernelBlockSize);
+        if(ck_tile::is_wave32()) // presumably a runtime query (constexpr was dropped) — confirm
+        {
+            return dim3(UniversalGemmKernel::kBlockSize / 2); // half of kBlockSize when is_wave32()
+        }
+        else
+        {
+            return dim3(UniversalGemmKernel::kBlockSize);
+        }
     }
 
     CK_TILE_HOST static constexpr BatchedGemmKernelArgs
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
index 079d3972d1..e37b4f36d4 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
@@ -113,6 +113,7 @@ struct GemmKernel
 
     static constexpr index_t NumATensor = 1;
     static constexpr index_t NumBTensor = 1;
+    static constexpr index_t kBlockSize = UniversalGemmKernel::kBlockSize;
 
     CK_TILE_HOST static auto GetName() -> const std::string
     {
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp
index 34340008d4..34c4e72b22 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp
@@ -86,6 +86,7 @@ struct GemmKernelMultiD
     /// functions.
     using UniversalGemmKernel =
         UniversalGemmKernel<TilePartitioner_, GemmPipeline_, EpiloguePipeline_>;
+    static constexpr index_t kBlockSize = UniversalGemmKernel::kBlockSize;
 
     using TilePartitioner  = remove_cvref_t<TilePartitioner_>;
     using GemmPipeline     = remove_cvref_t<GemmPipeline_>;
diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
index 477a87d42f..eac7f547c1 100644
--- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
@@ -128,7 +128,7 @@ struct GroupedGemmKernel
     using OffsetTile1DPartitioner = OffsettedTile1DPartitioner<TilePartitioner>;
     using Kernel = GroupedGemmKernel<TilePartitioner, GemmPipeline, EpiloguePipeline>;
 
-    static constexpr index_t KernelBlockSize  = GemmPipeline::BlockSize;
+    static constexpr index_t kBlockSize       = GemmPipeline::BlockSize;
     static constexpr bool UsePersistentKernel = GemmPipeline::UsePersistentKernel;
 
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
@@ -155,7 +155,17 @@ struct GroupedGemmKernel
         return group_count * sizeof(GemmTransKernelArg);
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() -> dim3 { return dim3(KernelBlockSize); }
+    CK_TILE_HOST static auto BlockSize() -> dim3
+    {
+        if(is_wave32())
+        {
+            return dim3(kBlockSize / 2);
+        }
+        else
+        {
+            return dim3(kBlockSize);
+        }
+    }
 
     /**
      * @brief Get the maximum occupancy grid size for the persistent kernel on the current device.
@@ -166,10 +176,10 @@ struct GroupedGemmKernel
     CK_TILE_HOST static auto MaxOccupancyGridSize(const stream_config& s) -> dim3
     {
         using ConstantPointer = const void CK_CONSTANT_ADDRESS_SPACE*;
-        const auto kernel     = kentry<KernelBlockSize, 1, Kernel, ConstantPointer, index_t>;
+        const auto kernel     = kentry<1, Kernel, ConstantPointer, index_t>;
         int occupancy;
         HIP_CHECK_ERROR(
-            hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, KernelBlockSize, 0));
+            hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, BlockSize().x, 0));
         const int grid_size = get_available_compute_units(s) * occupancy;
         return dim3(grid_size, 1, 1);
     }
diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
new file mode 100644
index 0000000000..a05e7b2ad0
--- /dev/null
+++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
@@ -0,0 +1,243 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#pragma once
+
+#include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/host/concat.hpp"
+
+namespace ck_tile {
+
+enum StreamKReductionStrategy : uint32_t
+{
+    /// @brief Workgroups atomically add their results to the C tensor
+    Atomic = 0u,
+    /// @brief For a given tile in the C tensor, one workgroup accumulates results of other
+    /// contributing workgroups
+    Reduction = 1u
+};
+
+/// @brief The Stream K GEMM kernel host arguments.
+///
+/// @par Overview
+///      This structure is passed to @ref StreamKKernel "StreamKKernel" when creating the kernel
+///      arguments object. It contains all necessary information required to build proper kernel
+///      arguments and launch the kernel on GPU. This structure defines the GEMM problem
+///      configuration by stating all required information like M,N,K sizes and respective strides.
+struct StreamKHostArgs : public ck_tile::UniversalGemmHostArgs<>
+{
+    CK_TILE_HOST explicit StreamKHostArgs(const void* a_ptr_,
+                                          const void* b_ptr_,
+                                          void* c_ptr_,
+                                          index_t M_,
+                                          index_t N_,
+                                          index_t K_,
+                                          index_t stride_A_,
+                                          index_t stride_B_,
+                                          index_t stride_C_,
+                                          StreamKReductionStrategy reduction_strategy_,
+                                          index_t num_sk_blocks_ = -1)
+        : UniversalGemmHostArgs<>({a_ptr_},
+                                  {b_ptr_},
+                                  {/*ds_ptr*/},
+                                  c_ptr_,
+                                  /*k_batch_ =*/1,
+                                  M_,
+                                  N_,
+                                  K_,
+                                  {stride_A_},
+                                  {stride_B_},
+                                  {/*stride_Ds_*/},
+                                  stride_C_),
+          reduction_strategy{reduction_strategy_},
+          num_sk_blocks{num_sk_blocks_}
+    {
+    }
+
+    ck_tile::StreamKReductionStrategy reduction_strategy;
+    index_t num_sk_blocks;
+};
+
+template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
+struct StreamKKernel
+{
+    /// @brief Inject the UniversalGemmKernel base class to support execution of all necessary
+    /// functions.
+    using UniversalGemmKernel =
+        UniversalGemmKernel<TilePartitioner_, GemmPipeline_, EpiloguePipeline_>;
+
+    static constexpr index_t kBlockSize = UniversalGemmKernel::kBlockSize;
+
+    using TilePartitioner  = remove_cvref_t<TilePartitioner_>;
+    using GemmPipeline     = remove_cvref_t<GemmPipeline_>;
+    using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
+
+    /// @brief  Specify the layout configurations for A, B, and C
+    using ALayout = remove_cvref_t<typename GemmPipeline::ALayout>;
+    using BLayout = remove_cvref_t<typename GemmPipeline::BLayout>;
+    using CLayout = remove_cvref_t<typename GemmPipeline::CLayout>;
+
+    /// @brief  Specify the data type configurations for A, B, and C
+    using ADataType = remove_cvref_t<typename GemmPipeline::ADataType>;
+    using BDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
+    using CDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+
+    /// @brief  ALayout and ADataType are expected to be scalars, not a tuple.
+    static_assert(!is_detected<is_tuple, ALayout>::value &&
+                      !is_detected<is_tuple, ADataType>::value,
+                  "ALayout and ADataType must be scalars.");
+
+    /// @brief  BLayout and BDataType are expected to be scalars, not a tuple.
+    static_assert(!is_detected<is_tuple, BLayout>::value &&
+                      !is_detected<is_tuple, BDataType>::value,
+                  "BLayout and BDataType must be scalars.");
+
+    /// @brief  CLayout and CDataType are expected to be scalars, not a tuple.
+    static_assert(!is_detected<is_tuple, CLayout>::value &&
+                      !is_detected<is_tuple, CDataType>::value,
+                  "CLayout and CDataType must be scalars.");
+
+    struct StreamKKernelArgs : ck_tile::UniversalGemmKernelArgs<>
+    {
+        /// @brief  The strategy used by work groups to compute final results in C tensor.
+        StreamKReductionStrategy reduction_strategy;
+        /// @brief  The number of stream k blocks.
+        index_t num_sk_blocks;
+        /// @brief  A pointer to a buffer in device memory for accumulating partial results via
+        /// the reduction strategy.
+        void* workspace_ptr;
+        /// @brief  An instance of the TilePartitioner class for assisting with mapping workgroups
+        /// to the C tensor.
+        TilePartitioner tile_partitioner;
+    };
+
+    using KernelArgs = StreamKKernelArgs;
+    using Kernel     = StreamKKernel<TilePartitioner, GemmPipeline, EpiloguePipeline>;
+
+    [[nodiscard]] CK_TILE_HOST static const std::string GetName()
+    {
+        // clang-format off
+        using P_ = GemmPipeline;
+        using WarpTile = typename P_::BlockGemmShape::WarpTile;
+
+        return concat('_', "streamk", gemm_prec_str<ADataType, BDataType>(),
+                      concat('x', P_::MPerBlock, P_::NPerBlock, P_::KPerBlock),
+                      concat('x', WarpTile::at(number<0>{}), WarpTile::at(number<1>{}), WarpTile::at(number<2>{})),
+                      concat('x', P_::GetVectorSizeA(), P_::GetVectorSizeB(), P_::GetVectorSizeC()),
+                      concat('x', P_::kPadM, P_::kPadN, P_::kPadK));
+        // clang-format on
+    }
+
+    /// @brief Compute the grid size for the Stream K kernel using the tile_partitioner.
+    /// @return The grid size.
+    CK_TILE_HOST static auto GridSize(const TilePartitioner& tile_partitioner) -> dim3
+    {
+        return tile_partitioner.GridSize();
+    }
+
+    /// @brief Get the maximum occupancy grid size for the persistent kernel on the current device.
+    /// @return The maximum occupancy grid size.
+    /// @note This function queries the maximum occupancy of the kernel using
+    /// `hipOccupancyMaxActiveBlocksPerMultiprocessor`.
+    CK_TILE_HOST static auto MaxOccupancyGridSize(const stream_config& s) -> dim3
+    {
+        return UniversalGemmKernel::MaxOccupancyGridSize(s);
+    }
+
+    /// @brief Get the workgroup (block) size used to launch the kernel.
+    /// @return The block size reported by UniversalGemmKernel::BlockSize().
+    /// @note Not constexpr because UniversalGemmKernel::BlockSize() is not constexpr either.
+    CK_TILE_HOST static auto BlockSize() -> dim3 { return UniversalGemmKernel::BlockSize(); }
+
+    CK_TILE_HOST static StreamKKernelArgs MakeKernelArgs(const StreamKHostArgs& host_args)
+    {
+        index_t occupancy = static_cast<index_t>(Occupancy());
+        index_t num_cu    = static_cast<index_t>(NumCU());
+
+        return StreamKKernelArgs{
+            {host_args.as_ptr,
+             host_args.bs_ptr,
+             host_args.ds_ptr,
+             host_args.e_ptr,
+             host_args.M,
+             host_args.N,
+             host_args.K,
+             host_args.stride_As,
+             host_args.stride_Bs,
+             host_args.stride_Ds,
+             host_args.stride_E,
+             host_args.k_batch},
+            host_args.reduction_strategy,
+            host_args.num_sk_blocks,
+            // The workspace pointer is set to nullptr because we must first
+            // instantiate the TilePartitioner to get the necessary size
+            /*workspace_ptr =*/nullptr,
+            TilePartitioner{
+                host_args.M, host_args.N, host_args.K, num_cu, occupancy, host_args.num_sk_blocks}};
+    }
+
+    CK_TILE_HOST static bool
+    IsSupportedArgument(const typename UniversalGemmKernel::KernelArgs& kargs)
+    {
+        return UniversalGemmKernel::IsSupportedArgument(kargs);
+    }
+
+    /// @brief Computes the buffer size needed to store accumulation results for Stream K.
+    /// @return The buffer size needed.
+    CK_TILE_HOST static uint32_t GetWorkSpaceSize(const StreamKKernelArgs& kargs)
+    {
+        // Only the Reduction strategy needs device space, which holds the accumulation
+        // results and semaphores.
+        const bool uses_reduction =
+            kargs.reduction_strategy == ck_tile::StreamKReductionStrategy::Reduction;
+
+        // Otherwise blocks atomically store their results and no additional space is
+        // needed, so 0 is returned. The tile partitioner is only asked for a size when
+        // the reduction strategy is selected.
+        return uses_reduction ? kargs.tile_partitioner.GetWorkSpaceSize(sizeof(CDataType)) : 0;
+    }
+
+    /// @brief Sets the kargs' current workspace_ptr to the given workspace_ptr.
+    /// @note Assumes that the given workspace_ptr points to allocated device memory.
+    CK_TILE_HOST static void SetWorkSpacePointer(StreamKKernelArgs& kargs, void* workspace_ptr)
+    {
+        kargs.workspace_ptr = workspace_ptr;
+    }
+
+    // Temporary placeholder to support the Occupancy() static function.
+    // Since the Occupancy function uses kentry, this class must have an operator() function
+    CK_TILE_DEVICE void operator()(StreamKKernelArgs /*kargs*/) const {}
+
+    private:
+    /// @brief Queries the multiprocessor count of the current HIP device.
+    /// @return The multiProcessorCount field reported by hipGetDeviceProperties.
+    CK_TILE_HOST static int NumCU()
+    {
+        hipDevice_t device_id;
+        hip_check_error(hipGetDevice(&device_id));
+        hipDeviceProp_t properties;
+        hip_check_error(hipGetDeviceProperties(&properties, device_id));
+        return properties.multiProcessorCount;
+    }
+
+    /// @brief Computes the occupancy (i.e. maximum number of active blocks per CU) for the kernel
+    /// @return The occupancy
+    /// @note This function queries the maximum occupancy of the kernel using
+    /// `hipOccupancyMaxActiveBlocksPerMultiprocessor`.
+    CK_TILE_HOST static int Occupancy()
+    {
+        int occupancy = 0;
+
+        // Since occupancy of 1 is valid for stream k, we set min_num_block_per_cu to 1
+        constexpr int min_block_per_cu = 1;
+        const auto kernel              = kentry<min_block_per_cu, Kernel, KernelArgs>;
+
+        // Query with the launch block size from BlockSize(), not the compile-time kBlockSize
+        hip_check_error(
+            hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, BlockSize().x, 0));
+        return occupancy;
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
index 1d513faea3..8117d65758 100644
--- a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
@@ -196,7 +196,7 @@ struct UniversalGemmKernel
     using ELayout   = remove_cvref_t<typename GemmPipeline::CLayout>;
     using EDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
 
-    static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
+    static constexpr index_t kBlockSize = GemmPipeline::BlockSize;
 
     // Get the persistent kernel if the pipeline has it available
     struct has_persistent_kernel
@@ -213,6 +213,23 @@ struct UniversalGemmKernel
     };
     static constexpr bool PersistentKernel = has_persistent_kernel::value;
 
+    // Check if TilePartitioner has GetOutputOffset method with kargs and k_id
+    struct has_tile_partitioner_output_offset_impl
+    {
+        // Well-formed only when T::GetOutputOffset(kargs, k_id) is a valid call expression.
+        template <typename T, typename KernelArgs>
+        using has_get_output_offset_t =
+            decltype(T::GetOutputOffset(std::declval<KernelArgs>(), std::declval<index_t>()));
+
+        // NOTE(review): the alias expects <T, KernelArgs> but only TilePartitioner is supplied
+        // below, so detection presumably always fails and the offset path stays disabled.
+        // TODO confirm; passing the kernel-args type as well looks like the intended usage.
+        static constexpr bool value =
+            is_detected<has_get_output_offset_t, TilePartitioner>::value;
+    };
+    static constexpr bool has_tile_partitioner_output_offset =
+        has_tile_partitioner_output_offset_impl::value;
+
     static constexpr auto I0 = number<0>();
     static constexpr auto I1 = number<1>();
     static constexpr auto I2 = number<2>();
@@ -258,15 +275,26 @@ struct UniversalGemmKernel
     CK_TILE_HOST static auto MaxOccupancyGridSize(const stream_config& s) -> dim3
     {
         using Kernel      = UniversalGemmKernel<TilePartitioner, GemmPipeline, EpiloguePipeline>;
-        const auto kernel = kentry<KernelBlockSize, 1, Kernel, KernelArgs>;
+        const auto kernel = kentry<1, Kernel, KernelArgs>;
         int occupancy;
         hip_check_error(
-            hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, KernelBlockSize, 0));
+            hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, BlockSize().x, 0));
+
         const int grid_size = get_available_compute_units(s) * occupancy;
         return dim3(grid_size, 1, 1);
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
+    CK_TILE_HOST static auto BlockSize()
+    {
+        if(ck_tile::is_wave32())
+        {
+            return dim3(kBlockSize / 2);
+        }
+        else
+        {
+            return dim3(kBlockSize);
+        }
+    }
 
     CK_TILE_HOST static constexpr KernelArgs
     MakeKernelArgs(const UniversalGemmHostArgs<NumATensor, NumBTensor, NumDTensor>& hostArgs)
@@ -354,7 +382,9 @@ struct UniversalGemmKernel
             }
         }
 
-        bool AsTesnorIsValid = {true};
+        const auto vectorSizeA = is_wave32() ? GemmPipeline::template GetVectorSizeA<true>()
+                                             : GemmPipeline::template GetVectorSizeA<false>();
+        bool AsTesnorIsValid   = {true};
         static_for<0, NumATensor, 1>{}([&](auto index) {
             using AiLayout = remove_cvref_t<std::tuple_element_t<index.value, AsLayout>>;
             if constexpr(std::is_same_v<AiLayout, tensor_layout::gemm::RowMajor>)
@@ -370,7 +400,7 @@ struct UniversalGemmKernel
                     }
                     AsTesnorIsValid = false;
                 }
-                if(kargs.K % GemmPipeline::GetVectorSizeA() != 0)
+                if(kargs.K % vectorSizeA != 0)
                 {
                     if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
                     {
@@ -390,7 +420,7 @@ struct UniversalGemmKernel
                     }
                     AsTesnorIsValid = false;
                 }
-                if(kargs.M % GemmPipeline::GetVectorSizeA() != 0)
+                if(kargs.M % vectorSizeA != 0)
                 {
                     if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
                     {
@@ -401,7 +431,9 @@ struct UniversalGemmKernel
             }
         });
 
-        bool BsTesnorIsValid = {true};
+        bool BsTesnorIsValid   = {true};
+        const auto vectorSizeB = is_wave32() ? GemmPipeline::template GetVectorSizeB<true>()
+                                             : GemmPipeline::template GetVectorSizeB<false>();
         static_for<0, NumBTensor, 1>{}([&](auto index) {
             using BiLayout = remove_cvref_t<std::tuple_element_t<index.value, BsLayout>>;
             if constexpr(std::is_same_v<BiLayout, tensor_layout::gemm::RowMajor>)
@@ -415,7 +447,7 @@ struct UniversalGemmKernel
                     }
                     BsTesnorIsValid = false;
                 }
-                if(kargs.N % GemmPipeline::GetVectorSizeB() != 0)
+                if(kargs.N % vectorSizeB != 0)
                 {
                     if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
                     {
@@ -437,7 +469,7 @@ struct UniversalGemmKernel
                     }
                     BsTesnorIsValid = false;
                 }
-                if(kargs.K % GemmPipeline::GetVectorSizeB() != 0)
+                if(kargs.K % vectorSizeB != 0)
                 {
                     if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
                     {
@@ -943,17 +975,15 @@ struct UniversalGemmKernel
         const auto& bs_block_window = gemm_tile_windows.at(I1);
         const auto& ds_block_window = gemm_tile_windows.at(I2);
 
-        const auto& c_block_tile = GemmPipeline{}.template operator()(
-            as_block_window[I0], bs_block_window[I0], num_loop, smem_ptr_0);
+        const auto& c_block_tile =
+            GemmPipeline{}(as_block_window[I0], bs_block_window[I0], num_loop, smem_ptr_0);
 
         if(UseDefaultScheduler || (get_warp_id() == 0))
         {
             // Run Epilogue Pipeline
             auto& c_block_window = gemm_tile_windows.at(I3);
 
-            EpiloguePipeline{}.template
-            operator()<decltype(c_block_window), decltype(c_block_tile), decltype(ds_block_window)>(
-                c_block_window, c_block_tile, ds_block_window, smem_ptr_0);
+            EpiloguePipeline{}(c_block_window, c_block_tile, ds_block_window, smem_ptr_0);
         }
     }
 
@@ -1001,15 +1031,13 @@ struct UniversalGemmKernel
         const auto& bs_block_window = gemm_tile_windows.at(I1);
         const auto& ds_block_window = gemm_tile_windows.at(I2);
 
-        const auto& c_block_tile = GemmPipeline{}.template operator()(
+        const auto& c_block_tile = GemmPipeline{}(
             as_block_window[I0], bs_block_window[I0], num_loop, smem_ptr_0, smem_ptr_1);
 
         // Run Epilogue Pipeline
         auto& c_block_window = gemm_tile_windows.at(I3);
 
-        EpiloguePipeline{}.template
-        operator()<decltype(c_block_window), decltype(c_block_tile), decltype(ds_block_window)>(
-            c_block_window, c_block_tile, ds_block_window, smem_ptr_0);
+        EpiloguePipeline{}(c_block_window, c_block_tile, ds_block_window, smem_ptr_0);
     }
 
     // Non-persistent kernel entry point
@@ -1036,7 +1064,13 @@ struct UniversalGemmKernel
                         splitk_batch_offset.bs_k_split_offset[i];
         });
 
+        // Calculate output offset from tile partitioner and apply to output pointer
         EDataType* e_ptr = static_cast<EDataType*>(kargs.e_ptr);
+        if constexpr(has_tile_partitioner_output_offset)
+        {
+            const index_t output_offset = TilePartitioner::GetOutputOffset(kargs, blockIdx.z);
+            e_ptr += output_offset;
+        }
 
         // allocate LDS
         __shared__ char smem_ptr_0[GetSmemSize()];
@@ -1114,7 +1148,13 @@ struct UniversalGemmKernel
                             splitk_batch_offset.bs_k_split_offset[i];
             });
 
+            // Calculate output offset from tile partitioner and apply to output pointer
             EDataType* e_ptr = static_cast<EDataType*>(kargs.e_ptr);
+            if constexpr(has_tile_partitioner_output_offset)
+            {
+                const index_t output_offset = TilePartitioner::GetOutputOffset(kargs, k_batch);
+                e_ptr += output_offset;
+            }
 
             // allocate LDS
             __shared__ char smem_ptr_0[GetSmemSize()];
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
index 5b7903a9e7..5f4ee8987e 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
@@ -127,8 +127,16 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
     static constexpr index_t NPerBlock = BlockGemmShape::kN;
     static constexpr index_t KPerBlock = BlockGemmShape::kK;
 
-    static constexpr index_t GetVectorSizeA() { return Policy::template GetVectorSizeA<Problem>(); }
-    static constexpr index_t GetVectorSizeB() { return Policy::template GetVectorSizeB<Problem>(); }
+    template <bool IsWave32Host = false>
+    static constexpr index_t GetVectorSizeA()
+    {
+        return Policy::template GetVectorSizeA<Problem, IsWave32Host>();
+    }
+    template <bool IsWave32Host = false>
+    static constexpr index_t GetVectorSizeB()
+    {
+        return Policy::template GetVectorSizeB<Problem, IsWave32Host>();
+    }
     static constexpr index_t GetVectorSizeC() { return Policy::template GetVectorSizeC<Problem>(); }
 
     static constexpr index_t APackedSize =
@@ -182,7 +190,7 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
         constexpr index_t NPerXDL = BlockGemm::WarpGemm::kN;
         constexpr index_t KPerXDL = BlockGemm::WarpGemm::WarpGemmAttribute::Impl::kK;
 
-        constexpr index_t WaveSize = 64;
+        constexpr index_t WaveSize = get_warp_size();
         constexpr index_t WaveNumM = BlockGemmShape::BlockWarps::at(I0{});
         constexpr index_t WaveNumN = BlockGemmShape::BlockWarps::at(I1{});
 
@@ -242,7 +250,7 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
             constexpr index_t NPerXDL = BlockGemm::WarpGemm::kN;
             constexpr index_t KPerXDL = BlockGemm::WarpGemm::WarpGemmAttribute::Impl::kK;
 
-            constexpr index_t WaveSize = 64;
+            constexpr index_t WaveSize = get_warp_size();
             constexpr index_t WaveNumM = BlockGemmShape::BlockWarps::at(I0{});
             constexpr index_t WaveNumN = BlockGemmShape::BlockWarps::at(I1{});
 
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
index 22c8cf383b..c835809b5d 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
@@ -124,8 +124,16 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
     static constexpr index_t NPerBlock = BlockGemmShape::kN;
     static constexpr index_t KPerBlock = BlockGemmShape::kK;
 
-    static constexpr index_t GetVectorSizeA() { return Policy::template GetVectorSizeA<Problem>(); }
-    static constexpr index_t GetVectorSizeB() { return Policy::template GetVectorSizeB<Problem>(); }
+    template <bool IsWave32Host = false>
+    static constexpr index_t GetVectorSizeA()
+    {
+        return Policy::template GetVectorSizeA<Problem, IsWave32Host>();
+    }
+    template <bool IsWave32Host = false>
+    static constexpr index_t GetVectorSizeB()
+    {
+        return Policy::template GetVectorSizeB<Problem, IsWave32Host>();
+    }
     static constexpr index_t GetVectorSizeC() { return Policy::template GetVectorSizeC<Problem>(); }
 
     static constexpr index_t GetSmemPackA() { return Policy::template GetSmemPackA<Problem>(); }
@@ -149,7 +157,7 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
         // clang-format off
-        return concat('_', "pipeline_AgBgCrCompV3", 
+        return concat('_', "pipeline_AgBgCrCompV4",
                       concat('x', MPerBlock, NPerBlock, KPerBlock,  BlockSize),
                       concat('x', GetVectorSizeA(), GetVectorSizeB(),  GetVectorSizeC()),
                       concat('x', kPadM, kPadN, kPadK));
@@ -182,7 +190,7 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
             constexpr index_t NPerXDL = BlockGemmShape::WarpTile::at(I1{});
             constexpr index_t KPerXDL = BlockGemmShape::WarpTile::at(I2{});
 
-            constexpr index_t WaveSize = 64;
+            constexpr index_t WaveSize = get_warp_size();
             constexpr index_t WaveNumM = BlockGemmShape::BlockWarps::at(I0{});
             constexpr index_t WaveNumN = BlockGemmShape::BlockWarps::at(I1{});
 
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp
index 7d88c804f3..a80ed57be5 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp
@@ -32,16 +32,17 @@ struct GemmPipelineAgBgCrCompV4DefaultPolicy
                 ? WGAttrNumAccessEnum::Double
                 : WGAttrNumAccessEnum::Single;
 
-        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                       typename Problem::BDataType,
-                                                       typename Problem::CDataType, // AccDataType
-                                                       WarpTile::at(I0),
-                                                       WarpTile::at(I1),
-                                                       WarpTile::at(I2),
-                                                       Problem::TransposeC,
-                                                       false,
-                                                       false,
-                                                       wg_attr_num_access>;
+        using WarpGemm = WarpGemmDispatcher<typename Problem::ADataType,
+                                            typename Problem::BDataType,
+                                            typename Problem::CDataType, // AccDataType
+                                            WarpTile::at(I0),
+                                            WarpTile::at(I1),
+                                            WarpTile::at(I2),
+                                            Problem::TransposeC,
+                                            false,
+                                            false,
+                                            wg_attr_num_access>;
+
         using BlockGemmPolicy = BlockGemmARegBRegCRegV1CustomPolicy<typename Problem::ADataType,
                                                                     typename Problem::BDataType,
                                                                     typename Problem::CDataType,
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
index 0fdcc04d89..b05145890f 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
@@ -61,8 +61,16 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5<Problem>
     static constexpr index_t NPerBlock = BlockGemmShape::kN;
     static constexpr index_t KPerBlock = BlockGemmShape::kK;
 
-    static constexpr index_t GetVectorSizeA() { return Policy::template GetVectorSizeA<Problem>(); }
-    static constexpr index_t GetVectorSizeB() { return Policy::template GetVectorSizeB<Problem>(); }
+    template <bool IsWave32Host = false>
+    static constexpr index_t GetVectorSizeA()
+    {
+        return Policy::template GetVectorSizeA<Problem, IsWave32Host>();
+    }
+    template <bool IsWave32Host = false>
+    static constexpr index_t GetVectorSizeB()
+    {
+        return Policy::template GetVectorSizeB<Problem, IsWave32Host>();
+    }
     static constexpr index_t GetVectorSizeC() { return Policy::template GetVectorSizeC<Problem>(); }
 
     static constexpr bool kPadM = Problem::kPadM;
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5_default_policy.hpp
index 17cd46d560..7065e55e6d 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5_default_policy.hpp
@@ -21,15 +21,16 @@ struct GemmPipelineAgBgCrCompV5DefaultPolicy
     CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
     {
         // using AccDataType     = float;
-        using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
-        using WarpTile        = typename Problem::BlockGemmShape::WarpTile;
-        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                       typename Problem::BDataType,
-                                                       typename Problem::CDataType, // AccDataType
-                                                       WarpTile::at(I0),
-                                                       WarpTile::at(I1),
-                                                       WarpTile::at(I2),
-                                                       Problem::TransposeC>;
+        using BlockWarps = typename Problem::BlockGemmShape::BlockWarps;
+        using WarpTile   = typename Problem::BlockGemmShape::WarpTile;
+        using WarpGemm   = WarpGemmDispatcher<typename Problem::ADataType,
+                                              typename Problem::BDataType,
+                                              typename Problem::CDataType, // AccDataType
+                                              WarpTile::at(I0),
+                                              WarpTile::at(I1),
+                                              WarpTile::at(I2),
+                                              Problem::TransposeC>;
+
         using BlockGemmPolicy = BlockGemmARegBRegCRegV1CustomPolicy<typename Problem::ADataType,
                                                                     typename Problem::BDataType,
                                                                     typename Problem::CDataType,
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
index d62add7ef3..e1acfebc47 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -176,8 +176,16 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
     static constexpr index_t NPerBlock = BlockGemmShape::kN;
     static constexpr index_t KPerBlock = BlockGemmShape::kK;
 
-    static constexpr index_t GetVectorSizeA() { return Policy::template GetVectorSizeA<Problem>(); }
-    static constexpr index_t GetVectorSizeB() { return Policy::template GetVectorSizeB<Problem>(); }
+    template <bool IsWave32Host = false>
+    static constexpr index_t GetVectorSizeA()
+    {
+        return Policy::template GetVectorSizeA<Problem, IsWave32Host>();
+    }
+    template <bool IsWave32Host = false>
+    static constexpr index_t GetVectorSizeB()
+    {
+        return Policy::template GetVectorSizeB<Problem, IsWave32Host>();
+    }
     static constexpr index_t GetVectorSizeC() { return Policy::template GetVectorSizeC<Problem>(); }
 
     static constexpr index_t GetSmemPackA() { return Policy::template GetSmemPackA<Problem>(); }
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
index d8118a7f8f..e3b4863392 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -36,8 +36,16 @@ struct GemmPipelineAGmemBGmemCRegV1
     static constexpr index_t kNPerBlock = BlockGemmShape::kN;
     static constexpr index_t kKPerBlock = BlockGemmShape::kK;
 
-    static constexpr index_t GetVectorSizeA() { return Problem::VectorSizeA; }
-    static constexpr index_t GetVectorSizeB() { return Problem::VectorSizeB; }
+    template <bool IsWave32Host = false>
+    static constexpr index_t GetVectorSizeA()
+    {
+        return Problem::VectorSizeA;
+    }
+    template <bool IsWave32Host = false>
+    static constexpr index_t GetVectorSizeB()
+    {
+        return Problem::VectorSizeB;
+    }
     static constexpr index_t GetVectorSizeC() { return Problem::VectorSizeC; }
 
     static constexpr index_t GetSmemPackA() { return Policy::template GetSmemPackA<Problem>(); }
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
index 0560ed9ba9..c8f4cfd4ec 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
@@ -390,16 +390,17 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
     {
-        using AccDataType     = float;
-        using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
-        using WarpTile        = typename Problem::BlockGemmShape::WarpTile;
-        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ComputeDataType,
-                                                       typename Problem::ComputeDataType,
-                                                       AccDataType,
-                                                       WarpTile::at(I0),
-                                                       WarpTile::at(I1),
-                                                       WarpTile::at(I2),
-                                                       Problem::TransposeC>;
+        using AccDataType = float;
+        using BlockWarps  = typename Problem::BlockGemmShape::BlockWarps;
+        using WarpTile    = typename Problem::BlockGemmShape::WarpTile;
+        using WarpGemm    = WarpGemmDispatcher<typename Problem::ComputeDataType,
+                                               typename Problem::ComputeDataType,
+                                               AccDataType,
+                                               WarpTile::at(I0),
+                                               WarpTile::at(I1),
+                                               WarpTile::at(I2),
+                                               Problem::TransposeC>;
+
         using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
                                                                       typename Problem::BDataType,
                                                                       typename Problem::CDataType,
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
index 15f3358aad..40ee952b1b 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
@@ -305,11 +305,15 @@ struct UniversalGemmBasePolicy
      * @tparam XPerTile     The contiguous Tile dimension size.
      * @return Maximum DRAM vector load size.
      */
-    template <typename Problem, typename DataType, index_t MNPerBlock, index_t XPerTile>
+    template <typename Problem,
+              typename DataType,
+              index_t MNPerBlock,
+              index_t XPerTile,
+              bool IsWave32Host>
     CK_TILE_HOST_DEVICE static constexpr auto GetGlobalVectorLoadSize()
     {
-        constexpr index_t BlockSize           = Problem::kBlockSize;
-        constexpr index_t KPerBlock           = Problem::BlockGemmShape::kK;
+        constexpr index_t BlockSize = IsWave32Host ? Problem::kBlockSize / 2 : Problem::kBlockSize;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
         constexpr index_t elements_per_thread = MNPerBlock * KPerBlock / BlockSize;
         constexpr index_t PackedSize =
             ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;
@@ -349,7 +353,7 @@ struct UniversalGemmBasePolicy
         }
     }
 
-    template <typename Problem>
+    template <typename Problem, bool IsWave32Host = false>
     CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeA()
     {
         using ALayout               = remove_cvref_t<typename Problem::ALayout>;
@@ -359,15 +363,23 @@ struct UniversalGemmBasePolicy
 
         if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::RowMajor>)
         {
-            return GetGlobalVectorLoadSize<Problem, ADataType, MPerBlock, KPerBlock>();
+            return GetGlobalVectorLoadSize<Problem,
+                                           ADataType,
+                                           MPerBlock,
+                                           KPerBlock,
+                                           IsWave32Host>();
         }
         else
         {
-            return GetGlobalVectorLoadSize<Problem, ADataType, MPerBlock, MPerBlock>();
+            return GetGlobalVectorLoadSize<Problem,
+                                           ADataType,
+                                           MPerBlock,
+                                           MPerBlock,
+                                           IsWave32Host>();
         }
     }
 
-    template <typename Problem>
+    template <typename Problem, bool IsWave32Host = false>
     CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeB()
     {
         using BLayout               = remove_cvref_t<typename Problem::BLayout>;
@@ -377,11 +389,19 @@ struct UniversalGemmBasePolicy
 
         if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>)
         {
-            return GetGlobalVectorLoadSize<Problem, BDataType, NPerBlock, NPerBlock>();
+            return GetGlobalVectorLoadSize<Problem,
+                                           BDataType,
+                                           NPerBlock,
+                                           NPerBlock,
+                                           IsWave32Host>();
         }
         else
         {
-            return GetGlobalVectorLoadSize<Problem, BDataType, NPerBlock, KPerBlock>();
+            return GetGlobalVectorLoadSize<Problem,
+                                           BDataType,
+                                           NPerBlock,
+                                           KPerBlock,
+                                           IsWave32Host>();
         }
     }
 
@@ -635,16 +655,17 @@ struct UniversalGemmPipelineAgBgCrPolicy
             : vector_size * 4 == thread_elements              ? WGAttrNumAccessEnum::Quad
                                                               : WGAttrNumAccessEnum::Invalid;
 
-        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ComputeDataType,
-                                                       typename Problem::ComputeDataType,
-                                                       typename Problem::CDataType,
-                                                       WarpTile::at(I0),
-                                                       WarpTile::at(I1),
-                                                       WarpTile::at(I2),
-                                                       Problem::TransposeC,
-                                                       false,
-                                                       Problem::UseStructuredSparsity,
-                                                       wg_attr_num_access>;
+        using WarpGemm = WarpGemmDispatcher<typename Problem::ComputeDataType,
+                                            typename Problem::ComputeDataType,
+                                            typename Problem::CDataType,
+                                            WarpTile::at(I0),
+                                            WarpTile::at(I1),
+                                            WarpTile::at(I2),
+                                            Problem::TransposeC,
+                                            false,
+                                            Problem::UseStructuredSparsity,
+                                            wg_attr_num_access>;
+
         using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
                                                                       typename Problem::BDataType,
                                                                       typename Problem::CDataType,
diff --git a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_base_policy.hpp b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_base_policy.hpp
index 83555e5295..f28208df52 100644
--- a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_base_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_base_policy.hpp
@@ -280,13 +280,13 @@ struct UniversalWeightPreshufflePipelineAgBgCrPolicy
     {
         using BlockWarps = typename Problem::BlockGemmShape::BlockWarps;
         using WarpTile   = typename Problem::BlockGemmShape::WarpTile;
-        using WarpGemm   = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                  typename Problem::BDataType,
-                                                  typename Problem::CDataType,
-                                                  WarpTile::at(I0),
-                                                  WarpTile::at(I1),
-                                                  WarpTile::at(I2),
-                                                  Problem::TransposeC>;
+        using WarpGemm   = WarpGemmDispatcher<typename Problem::ADataType,
+                                              typename Problem::BDataType,
+                                              typename Problem::CDataType,
+                                              WarpTile::at(I0),
+                                              WarpTile::at(I1),
+                                              WarpTile::at(I2),
+                                              Problem::TransposeC>;
 
         using BlockWeightPreshufflePolicy =
             BlockWeightPreshuffleASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
diff --git a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v1.hpp
index cadd77a61f..b91c211d91 100644
--- a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v1.hpp
@@ -59,13 +59,15 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV1
     static constexpr index_t flatKPerWarp = BlockGemmShape::flatKPerWarp;
     static constexpr index_t flatNPerWarp = BlockGemmShape::flatNPerWarp;
 
+    template <bool IsWave32Host = false>
     static constexpr index_t GetVectorSizeA()
     {
-        return PipelinePolicy::template GetVectorSizeA<Problem>();
+        return PipelinePolicy::template GetVectorSizeA<Problem, IsWave32Host>();
     }
+    template <bool IsWave32Host = false>
     static constexpr index_t GetVectorSizeB()
     {
-        return PipelinePolicy::template GetVectorSizeB<Problem>();
+        return PipelinePolicy::template GetVectorSizeB<Problem, IsWave32Host>();
     }
 
     static constexpr bool kPadM = Problem::kPadM;
diff --git a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp
index 9c0f257e8e..c507d8d8d8 100644
--- a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp
@@ -76,13 +76,15 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
     static constexpr index_t flatKPerWarp = BlockGemmShape::flatKPerWarp;
     static constexpr index_t flatNPerWarp = BlockGemmShape::flatNPerWarp;
 
+    template <bool IsWave32Host = false>
     static constexpr index_t GetVectorSizeA()
     {
-        return PipelinePolicy::template GetVectorSizeA<Problem>();
+        return PipelinePolicy::template GetVectorSizeA<Problem, IsWave32Host>();
     }
+    template <bool IsWave32Host = false>
     static constexpr index_t GetVectorSizeB()
     {
-        return PipelinePolicy::template GetVectorSizeB<Problem>();
+        return PipelinePolicy::template GetVectorSizeB<Problem, IsWave32Host>();
     }
 
     static constexpr bool kPadM = Problem::kPadM;
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
index fb191d565d..c42874ca55 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
@@ -15,19 +15,19 @@ namespace ck_tile {
 // fp16
 
 using WarpGemmMfmaF16F16F32M32N32K8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfmaF16F16F32M16N16K16 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
 
 #if defined(__gfx950__)
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K16<WGAttrCtlEnum::Default_>,
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K16<WGAttrCtlEnum::Default_>,
                           AttrNumAccess>>;
 #else
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
-using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
     WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
     2,
     AttrNumAccess>>;
@@ -36,42 +36,42 @@ using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterate
 #if defined(__gfx950__)
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K32<WGAttrCtlEnum::Default_>,
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K32<WGAttrCtlEnum::Default_>,
                           AttrNumAccess>>;
 #else
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
-using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
     WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>,
     2,
     AttrNumAccess>>;
 #endif
 
-using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
+using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl<WarpGemmAttributeMfmaIterateK_SwizzleA<
     WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
     1>>;
 
-using WarpGemmMfmaF16F16F32M32N32K16SwizzleA = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
+using WarpGemmMfmaF16F16F32M32N32K16SwizzleA = WarpGemmImpl<WarpGemmAttributeMfmaIterateK_SwizzleA<
     WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
     2>>;
 
 using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
 
 #if defined(__gfx950__)
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImplF16F16F32M32N32K16<WGAttrCtlEnum::Default_>,
         AttrNumAccess>>;
 #else
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution<
         WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2,
         AttrNumAccess>>;
@@ -80,13 +80,13 @@ using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution =
 #if defined(__gfx950__)
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImplF16F16F32M16N16K32<WGAttrCtlEnum::Default_>,
         AttrNumAccess>>;
 #else
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution<
         WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>,
         2,
         AttrNumAccess>>;
@@ -94,32 +94,36 @@ using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution =
 
 #if defined(__gfx950__)
 using WarpGemmMfmaF16F16F32M16N16K32SwizzleBTransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB<
         WarpGemmAttributeMfmaImplF16F16F32M16N16K32<WGAttrCtlEnum::Default_>,
         1>>;
 
 using WarpGemmMfmaBf16Bf16F32M16N16K32SwizzleBTransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB<
         WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32<WGAttrCtlEnum::Default_>,
         1>>;
 #endif
 
+using WarpGemmMfmaF16F16F32M32N32K8SwizzleBTransposedCDistribution =
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB<
+        WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
+
 #if defined(__gfx950__)
 using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB<
         WarpGemmAttributeMfmaImplF16F16F32M32N32K16<WGAttrCtlEnum::Default_>>>;
 #else
 using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
+    WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
         WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2>>;
 #endif
 
-using WarpGemmMfmaF16F16F32M4N64K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+using WarpGemmMfmaF16F16F32M4N64K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
     WarpGemmAttributeMfmaImplF16F16F32M4N64K4<WGAttrCtlEnum::Default_>,
     4>>;
 
-using WarpGemmMfmaF16F16F32M64N4K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+using WarpGemmMfmaF16F16F32M64N4K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
     WarpGemmAttributeMfmaImplF16F16F32M64N4K4<WGAttrCtlEnum::Default_>,
     4>>;
 
@@ -132,19 +136,19 @@ using WarpGemmSmfmacF16F16F32M16N16K32 = WarpGemmSmfmacImpl<WarpGemmAttributeSmf
 
 // bf16
 using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfmaBf16Bf16F32M16N16K16 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
 
 #if defined(__gfx950__)
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K16<WGAttrCtlEnum::Default_>,
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K16<WGAttrCtlEnum::Default_>,
                           AttrNumAccess>>;
 #else
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
-using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
     WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
     2,
     AttrNumAccess>>;
@@ -153,43 +157,43 @@ using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl<WarpGemmAtrributeMfmaItera
 #if defined(__gfx950__)
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32<WGAttrCtlEnum::Default_>,
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32<WGAttrCtlEnum::Default_>,
                           AttrNumAccess>>;
 #else
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
-using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
     WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>,
     2,
     AttrNumAccess>>;
 #endif
 
-using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
+using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl<WarpGemmAttributeMfmaIterateK_SwizzleA<
     WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
     1>>;
 
 using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK_SwizzleA<
+    WarpGemmImpl<WarpGemmAttributeMfmaIterateK_SwizzleA<
         WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2>>;
 
 using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
 
 #if defined(__gfx950__)
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K16<WGAttrCtlEnum::Default_>,
         AttrNumAccess>>;
 #else
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution<
         WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2,
         AttrNumAccess>>;
@@ -198,149 +202,153 @@ using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution =
 #if defined(__gfx950__)
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32<WGAttrCtlEnum::Default_>,
         AttrNumAccess>>;
 #else
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution<
         WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>,
         2,
         AttrNumAccess>>;
 #endif
 
+using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleBTransposedCDistribution =
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB<
+        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>>>;
+
 #if defined(__gfx950__)
 using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB<
         WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K16<WGAttrCtlEnum::Default_>>>;
 #else
 using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
+    WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
         WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>,
         2>>;
 #endif
 
-using WarpGemmMfmaBf16Bf16F32M4N64K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+using WarpGemmMfmaBf16Bf16F32M4N64K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
     WarpGemmAttributeMfmaImplBf16Bf16F32M4N64K4<WGAttrCtlEnum::Default_>,
     4>>;
 
-using WarpGemmMfmaBf16Bf16F32M64N4K16 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+using WarpGemmMfmaBf16Bf16F32M64N4K16 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
     WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4<WGAttrCtlEnum::Default_>,
     4>>;
 
 // fp8
 
 using WarpGemmMfma_f32_32x32x16_fp8_fp8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>>>;
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfma_f32_32x32x16_fp8_bf8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8<WGAttrCtlEnum::Default_>>>;
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfma_f32_32x32x16_bf8_fp8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8<WGAttrCtlEnum::Default_>>>;
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfma_f32_32x32x16_bf8_bf8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>>>;
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>>>;
 
-using WarpGemmMfma_f32_32x32x32_fp8_fp8 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+using WarpGemmMfma_f32_32x32x32_fp8_fp8 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
     WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>,
     2>>;
 
-using WarpGemmMfma_f32_32x32x32_bf8_bf8 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+using WarpGemmMfma_f32_32x32x32_bf8_bf8 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
     WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>,
     2>>;
 
 using WarpGemmMfma_f32_16x16x32_fp8_fp8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8<WGAttrCtlEnum::Default_>>>;
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfma_f32_16x16x32_bf8_bf8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8<WGAttrCtlEnum::Default_>>>;
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8<WGAttrCtlEnum::Default_>>>;
 
-using WarpGemmMfma_f32_16x16x64_fp8_fp8 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+using WarpGemmMfma_f32_16x16x64_fp8_fp8 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
     WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8<WGAttrCtlEnum::Default_>,
     2>>;
 
-using WarpGemmMfma_f32_16x16x64_bf8_bf8 = WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<
+using WarpGemmMfma_f32_16x16x64_bf8_bf8 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
     WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8<WGAttrCtlEnum::Default_>,
     2>>;
 
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfma_f32_16x16x128_fp8_fp8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8<WGAttrCtlEnum::Default_>,
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8<WGAttrCtlEnum::Default_>,
                           AttrNumAccess>>;
 
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfma_f32_16x16x128_fp8_bf8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8<WGAttrCtlEnum::Default_>,
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8<WGAttrCtlEnum::Default_>,
                           AttrNumAccess>>;
 
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfma_f32_16x16x128_bf8_fp8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8<WGAttrCtlEnum::Default_>,
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8<WGAttrCtlEnum::Default_>,
                           AttrNumAccess>>;
 
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfma_f32_16x16x128_bf8_bf8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8<WGAttrCtlEnum::Default_>,
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8<WGAttrCtlEnum::Default_>,
                           AttrNumAccess>>;
 
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfma_f32_32x32x64_fp8_fp8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8<WGAttrCtlEnum::Default_>,
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8<WGAttrCtlEnum::Default_>,
                           AttrNumAccess>>;
 
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfma_f32_32x32x64_fp8_bf8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8<WGAttrCtlEnum::Default_>,
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8<WGAttrCtlEnum::Default_>,
                           AttrNumAccess>>;
 
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfma_f32_32x32x64_bf8_fp8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8<WGAttrCtlEnum::Default_>,
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8<WGAttrCtlEnum::Default_>,
                           AttrNumAccess>>;
 
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfma_f32_32x32x64_bf8_bf8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8<WGAttrCtlEnum::Default_>,
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8<WGAttrCtlEnum::Default_>,
                           AttrNumAccess>>;
 
 using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>>>;
 
 template <index_t swizzle_factor = 2>
 using WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution =
-    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
+    WarpGemmImpl<WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
         WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t, WGAttrCtlEnum::Default_>,
         2,
         swizzle_factor>>;
 
 // int8
 using WarpGemmMfma_i32_32x32x16_i8_i8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_i32_32x32x16_i8<WGAttrCtlEnum::Default_>>>;
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_i32_32x32x16_i8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfma_i32_32x32x16_i8_i8_CTransposed =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImpl_i32_32x32x16_i8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfma_i32_16x16x32_i8_i8 = WarpGemmImpl<
-    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_i32_16x16x32_i8<WGAttrCtlEnum::Default_>>>;
+    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_i32_16x16x32_i8<WGAttrCtlEnum::Default_>>>;
 
 using WarpGemmMfma_i32_16x16x32_i8_i8_CTransposed =
-    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
         WarpGemmAttributeMfmaImpl_i32_16x16x32_i8<WGAttrCtlEnum::Default_>>>;
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
index 97fab489ab..36a9955912 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
@@ -19,7 +19,7 @@ enum class WGAttrNumAccessEnum
 
 template <typename WarpGemmAttributeMfmaImpl_,
           WGAttrNumAccessEnum AttrNumAccess_ = WGAttrNumAccessEnum::Single>
-struct WarpGemmAtrributeMfma
+struct WarpGemmAttributeMfma
 {
     using Impl                           = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
     static constexpr auto AttrNumAccess  = AttrNumAccess_;
@@ -103,7 +103,7 @@ struct WarpGemmAtrributeMfma
 template <typename WarpGemmAttributeMfmaImpl_,
           index_t kKIter,
           WGAttrNumAccessEnum AttrNumAccess_ = WGAttrNumAccessEnum::Single>
-struct WarpGemmAtrributeMfmaIterateK
+struct WarpGemmAttributeMfmaIterateK
 {
     static_assert(kKIter > 0, "wrong!");
 
@@ -367,7 +367,7 @@ struct WarpGemmAtrributeMfmaIterateK
 
 template <typename WarpGemmAttributeMfmaImpl_,
           WGAttrNumAccessEnum AttrNumAccess_ = WGAttrNumAccessEnum::Single>
-struct WarpGemmAtrributeMfmaTransposedCDistribution
+struct WarpGemmAttributeMfmaTransposedCDistribution
 {
     using Impl                           = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
     static constexpr auto AttrNumAccess  = AttrNumAccess_;
@@ -450,7 +450,7 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution
 };
 
 template <typename WarpGemmAttributeMfmaImpl_, index_t SFactor_ = 2>
-struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
+struct WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB
 {
     using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
 
@@ -546,7 +546,7 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
 template <typename WarpGemmAttributeMfmaImpl_,
           index_t kKIter,
           WGAttrNumAccessEnum AttrNumAccess_ = WGAttrNumAccessEnum::Single>
-struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
+struct WarpGemmAttributeMfmaIterateKAndTransposedCDistribution
 {
     using Impl                          = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
     static constexpr auto AttrNumAccess = AttrNumAccess_;
@@ -574,13 +574,13 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
 
     CK_TILE_DEVICE static constexpr auto get_awarp_dstr_encoding()
     {
-        return WarpGemmAtrributeMfmaIterateK<Impl, kKIter, AttrNumAccess>::
+        return WarpGemmAttributeMfmaIterateK<Impl, kKIter, AttrNumAccess>::
             get_bwarp_dstr_encoding();
     }
 
     CK_TILE_DEVICE static constexpr auto get_bwarp_dstr_encoding()
     {
-        return WarpGemmAtrributeMfmaIterateK<Impl, kKIter, AttrNumAccess>::
+        return WarpGemmAttributeMfmaIterateK<Impl, kKIter, AttrNumAccess>::
             get_awarp_dstr_encoding();
     }
 
@@ -696,7 +696,7 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
 };
 
 template <typename WarpGemmAttributeMfmaImpl_, index_t kKIter, index_t SFactor_ = 2>
-struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
+struct WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB
 {
     using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
 
@@ -840,7 +840,7 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
 };
 
 template <typename WarpGemmAttributeMfmaImpl_, index_t kKIter, index_t SFactor_ = 2>
-struct WarpGemmAtrributeMfmaIterateK_SwizzleA
+struct WarpGemmAttributeMfmaIterateK_SwizzleA
 {
     using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
 
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma.hpp
new file mode 100644
index 0000000000..0f021c62f2
--- /dev/null
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma.hpp
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/device_prop.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl.hpp"
+
+namespace ck_tile {
+
+// TODO: currently only 16-bit input is supported, i.e. only tr16_b128; ADataType will be used
+// to determine the layout in the future.
+template <typename Impl>
+struct AWarpDstrEncodingTrait // `type` is the tile_distribution_encoding assembled from Impl's A-side constants
+{
+    using type = tile_distribution_encoding<
+        sequence<Impl::kRepeat>,
+        tuple<sequence<Impl::kAMLane>, // A uses the M-lane count; BWarpDstrEncodingTrait uses kBNLane here
+              sequence<Impl::kABK0PerLane, Impl::kABKLane, Impl::kABK1PerLane>>,
+        tuple<typename Impl::kABPs2RHssMajor>, // the kAB* mapping sequences are the same ones B uses
+        tuple<typename Impl::kABPs2RHssMinor>,
+        typename Impl::kABYs2RHsMajor,
+        typename Impl::kABYs2RHsMinor>;
+};
+
+template <typename Impl>
+struct BWarpDstrEncodingTrait // `type` is the tile_distribution_encoding assembled from Impl's B-side constants
+{
+    using type = tile_distribution_encoding<
+        sequence<Impl::kRepeat>,
+        tuple<sequence<Impl::kBNLane>, // B uses the N-lane count; AWarpDstrEncodingTrait uses kAMLane here
+              sequence<Impl::kABK0PerLane, Impl::kABKLane, Impl::kABK1PerLane>>,
+        tuple<typename Impl::kABPs2RHssMajor>, // the kAB* mapping sequences are the same ones A uses
+        tuple<typename Impl::kABPs2RHssMinor>,
+        typename Impl::kABYs2RHsMajor,
+        typename Impl::kABYs2RHsMinor>;
+};
+
+template <typename Impl>
+struct CWarpDstrEncodingTrait // `type` is the tile_distribution_encoding assembled from Impl's C-side constants
+{
+    using type = tile_distribution_encoding<
+        sequence<>, // empty here, whereas the A/B traits pass sequence<Impl::kRepeat>
+        tuple<sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>,
+              sequence<Impl::kCNLane>>,
+        tuple<typename Impl::kCPs2RHssMajor>, // C has its own kC* mapping sequences, separate from kAB*
+        tuple<typename Impl::kCPs2RHssMinor>,
+        typename Impl::kCYs2RHsMajor,
+        typename Impl::kCYs2RHsMinor>;
+};
+
+template <typename WarpGemmAttributeWmmaImpl_, bool kTransC = false>
+struct WarpGemmAttributeWmma // adapter over a WMMA Impl; kTransC swaps the A/B operands handed to Impl
+{
+    using Impl = remove_cvref_t<WarpGemmAttributeWmmaImpl_>;
+
+    using ADataType = typename Impl::ADataType;
+    using BDataType = typename Impl::BDataType;
+    using CDataType = typename Impl::CDataType;
+
+    using AVecType = typename Impl::AVecType;
+    using BVecType = typename Impl::BVecType;
+    using CVecType = typename Impl::CVecType;
+
+    static constexpr index_t kM          = Impl::kM;
+    static constexpr index_t kN          = Impl::kN;
+    static constexpr index_t kK          = Impl::kK;
+    static constexpr index_t kKPerThread = Impl::kABK0PerLane * Impl::kABK1PerLane;
+
+    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
+
+    // WmmaTraitsBase<gfx11_t>: kAMLane = 16, kABK0PerLane = 1, kABKLane = 1, kABK1PerLane = 16
+    // WmmaTraitsBase<gfx12_t>: kAMLane = 16, kABK0PerLane = 2, kABKLane = 2, kABK1PerLane = 4
+    using AWarpDstrEncoding = typename AWarpDstrEncodingTrait<Impl>::type;
+    using BWarpDstrEncoding = typename BWarpDstrEncodingTrait<Impl>::type;
+
+    // WmmaTraitsBase<gfx11_t>: kCM0PerLane = 8, kCMLane = 2, kCM1PerLane = 1, kCNLane = 16
+    // WmmaTraitsBase<gfx12_t>: kCM0PerLane = 1, kCMLane = 2, kCM1PerLane = 8, kCNLane = 16
+    using CWarpDstrEncoding = typename CWarpDstrEncodingTrait<Impl>::type;
+
+    // c_vec += a_vec * b_vec (operands are handed to Impl in swapped order when kTransC is true)
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        if constexpr(kTransC)
+        {
+            Impl{}(c_vec, b_vec, a_vec, bool_constant<post_nop_>{}); // NOTE(review): b_vec lands in Impl's AVecType slot — confirm for mixed fp8/bf8
+        }
+        else
+        {
+            Impl{}(c_vec, a_vec, b_vec, bool_constant<post_nop_>{});
+        }
+    }
+
+    // c_vec = a_vec * b_vec (same operand swap as above when kTransC is true)
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+        if constexpr(kTransC)
+        {
+            return Impl{}(b_vec, a_vec);
+        }
+        else
+        {
+            return Impl{}(a_vec, b_vec);
+        }
+    }
+};
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          index_t M_Warp_Tile,
+          index_t N_Warp_Tile,
+          index_t K_Warp_Tile>
+CK_TILE_HOST bool check_wmma_supported() // true iff a WmmaTraits specialization exists for the matched arch tag
+{
+    if(is_gfx12_supported()) // gfx12 is tested first; presumably a device query — confirm in device_prop.hpp
+    {
+        return has_wmma_traits_v<gfx12_t,
+                                 ADataType,
+                                 BDataType,
+                                 AccDataType,
+                                 M_Warp_Tile,
+                                 N_Warp_Tile,
+                                 K_Warp_Tile>;
+    }
+    else if(is_gfx11_supported())
+    {
+        return has_wmma_traits_v<gfx11_t,
+                                 ADataType,
+                                 BDataType,
+                                 AccDataType,
+                                 M_Warp_Tile,
+                                 N_Warp_Tile,
+                                 K_Warp_Tile>;
+    }
+    else // neither check passed: report WMMA as unsupported
+    {
+        return false;
+    }
+}
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl.hpp
new file mode 100644
index 0000000000..88fde40067
--- /dev/null
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl.hpp
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/core/config.hpp"
+
+namespace ck_tile {
+
+// Primary WmmaTraits template: only declared here, so just the explicit specializations are usable
+template <typename Arch,
+          typename AType,
+          typename BType,
+          typename CType,
+          index_t M,
+          index_t N,
+          index_t K>
+struct WmmaTraits;
+
+// Generic WMMA implementation using traits
+template <typename Traits>
+struct WarpGemmAttributeWmmaImpl // re-exports Traits' types/constants and wraps Traits::wmma_intrinsic
+{
+    using ADataType = typename Traits::ADataType;
+    using BDataType = typename Traits::BDataType;
+    using CDataType = typename Traits::CDataType;
+
+    using AVecType = typename Traits::AVecType;
+    using BVecType = typename Traits::BVecType;
+    using CVecType = typename Traits::CVecType;
+
+    // Forward all static constants and type aliases
+    static constexpr index_t kM = Traits::kM;
+    static constexpr index_t kN = Traits::kN;
+    static constexpr index_t kK = Traits::kK;
+
+    static constexpr index_t kAMBlock = Traits::kAMBlock;
+    static constexpr index_t kBNBlock = Traits::kBNBlock;
+
+    static constexpr index_t kRepeat      = Traits::kRepeat;
+    static constexpr index_t kAMLane      = Traits::kAMLane;
+    static constexpr index_t kBNLane      = Traits::kBNLane;
+    static constexpr index_t kABK0PerLane = Traits::kABK0PerLane;
+    static constexpr index_t kABKLane     = Traits::kABKLane;
+    static constexpr index_t kABK1PerLane = Traits::kABK1PerLane;
+
+    static constexpr index_t kCMLane     = Traits::kCMLane;
+    static constexpr index_t kCNLane     = Traits::kCNLane;
+    static constexpr index_t kCM0PerLane = Traits::kCM0PerLane;
+    static constexpr index_t kCM1PerLane = Traits::kCM1PerLane;
+
+    using kABPs2RHssMajor = typename Traits::kABPs2RHssMajor;
+    using kABPs2RHssMinor = typename Traits::kABPs2RHssMinor;
+    using kABYs2RHsMajor  = typename Traits::kABYs2RHsMajor;
+    using kABYs2RHsMinor  = typename Traits::kABYs2RHsMinor;
+
+    using kCPs2RHssMajor = typename Traits::kCPs2RHssMajor;
+    using kCPs2RHssMinor = typename Traits::kCPs2RHssMinor;
+    using kCYs2RHsMajor  = typename Traits::kCYs2RHsMajor;
+    using kCYs2RHsMinor  = typename Traits::kCYs2RHsMinor;
+
+    // c_vec += a_vec * b_vec (clamp is forwarded to Traits::wmma_intrinsic)
+    template <bool clamp = false, bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const // tag is accepted but not used here
+    {
+        c_vec = Traits::template wmma_intrinsic<clamp>(a_vec, b_vec, c_vec);
+    }
+
+    // c_vec = a_vec * b_vec (accumulates onto an all-zero CVecType)
+    template <bool clamp = false>
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+        return bit_cast<CVecType>( // CVecType{} is valid for float and integer vectors; {0.f} narrows for int32
+            Traits::template wmma_intrinsic<clamp>(a_vec, b_vec, CVecType{}));
+    }
+};
+
+using DeviceIp = remove_cvref_t<decltype(ck_tile::get_device_arch())>; // arch tag type yielded by get_device_arch()
+using WarpGemmAttributeWmmaImpl_f32_16x16x16_f16_f16 =
+    WarpGemmAttributeWmmaImpl<WmmaTraits<DeviceIp, fp16_t, fp16_t, float, 16, 16, 16>>;
+
+using WarpGemmAttributeWmmaImpl_f32_16x16x16_bf16_bf16 =
+    WarpGemmAttributeWmmaImpl<WmmaTraits<DeviceIp, bf16_t, bf16_t, float, 16, 16, 16>>;
+
+using WarpGemmAttributeWmmaImpl_i32_16x16x16_i8_i8 =
+    WarpGemmAttributeWmmaImpl<WmmaTraits<DeviceIp, int8_t, int8_t, int32_t, 16, 16, 16>>;
+
+using WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_f8 = // fp8/bf8 aliases name gfx12_t directly instead of DeviceIp
+    WarpGemmAttributeWmmaImpl<WmmaTraits<gfx12_t, fp8_t, fp8_t, float, 16, 16, 16>>;
+
+using WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_bf8 =
+    WarpGemmAttributeWmmaImpl<WmmaTraits<gfx12_t, bf8_t, bf8_t, float, 16, 16, 16>>;
+
+using WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_bf8 =
+    WarpGemmAttributeWmmaImpl<WmmaTraits<gfx12_t, fp8_t, bf8_t, float, 16, 16, 16>>;
+
+using WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_f8 =
+    WarpGemmAttributeWmmaImpl<WmmaTraits<gfx12_t, bf8_t, fp8_t, float, 16, 16, 16>>;
+
+template <typename Arch,
+          typename AType,
+          typename BType,
+          typename CType,
+          index_t warp_m,
+          index_t warp_n,
+          index_t warp_k>
+struct has_wmma_traits // `value` is true when WmmaTraits<Arch, ...> exposes a nested ADataType
+{
+    template <typename T>
+    static auto
+    test(int) -> decltype(std::declval<
+                              typename WmmaTraits<T, AType, BType, CType, warp_m, warp_n, warp_k>::
+                                  ADataType>(),
+                          std::true_type{}); // viable only if the nested ADataType can be named
+
+    template <typename>
+    static std::false_type test(...); // fallback selected when the overload above is not viable
+
+    static constexpr bool value = decltype(test<Arch>(0))::value;
+};
+
+template <typename Arch,
+          typename AType,
+          typename BType,
+          typename CType,
+          index_t warp_m,
+          index_t warp_n,
+          index_t warp_k>
+constexpr bool has_wmma_traits_v = // variable-template shorthand for has_wmma_traits<...>::value
+    has_wmma_traits<Arch, AType, BType, CType, warp_m, warp_n, warp_k>::value;
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_16bit_traits.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_16bit_traits.hpp
new file mode 100644
index 0000000000..7e834d9add
--- /dev/null
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_16bit_traits.hpp
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "warp_gemm_attribute_wmma_impl_base_traits.hpp"
+namespace ck_tile {
+// fp16 specialization - GFX11: wraps __builtin_amdgcn_wmma_f32_16x16x16_f16_w32
+template <>
+struct WmmaTraits<gfx11_t, fp16_t, fp16_t, float, 16, 16, 16>
+    : WmmaTraitsBase<gfx11_t, fp16_t, fp16_t, float>
+{
+    template <bool clamp = false> // clamp is part of the common signature but is not used in this body
+    CK_TILE_DEVICE static CVecType
+    wmma_intrinsic(const AVecType& a_vec, const BVecType& b_vec, const CVecType& c_vec)
+    {
+#ifdef __gfx11__
+        return __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_vec, b_vec, c_vec);
+#else // __gfx11__ not defined: inputs are only assigned to ck_tile::ignore and a zero vector is returned
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        ck_tile::ignore = c_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
+// bf16 specialization - GFX11: wraps __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32
+template <>
+struct WmmaTraits<gfx11_t, bf16_t, bf16_t, float, 16, 16, 16>
+    : WmmaTraitsBase<gfx11_t, bf16_t, bf16_t, float>
+{
+    template <bool clamp = false> // clamp is part of the common signature but is not used in this body
+    CK_TILE_DEVICE static CVecType
+    wmma_intrinsic(const AVecType& a_vec, const BVecType& b_vec, const CVecType& c_vec)
+    {
+#ifdef __gfx11__
+        return __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_vec, b_vec, c_vec);
+#else // __gfx11__ not defined: inputs are only assigned to ck_tile::ignore and a zero vector is returned
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        ck_tile::ignore = c_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
+// fp16 specialization - GFX12: wraps __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12
+template <>
+struct WmmaTraits<gfx12_t, fp16_t, fp16_t, float, 16, 16, 16>
+    : WmmaTraitsBase<gfx12_t, fp16_t, fp16_t, float>
+{
+    template <bool clamp = false> // clamp is part of the common signature but is not used in this body
+    CK_TILE_DEVICE static CVecType
+    wmma_intrinsic(const AVecType& a_vec, const BVecType& b_vec, const CVecType& c_vec)
+    {
+#ifdef __gfx12__
+        return __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_vec, b_vec, c_vec);
+#else // __gfx12__ not defined: inputs are only assigned to ck_tile::ignore and a zero vector is returned
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        ck_tile::ignore = c_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+
+// bf16 specialization - GFX12: wraps __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12
+template <>
+struct WmmaTraits<gfx12_t, bf16_t, bf16_t, float, 16, 16, 16>
+    : WmmaTraitsBase<gfx12_t, bf16_t, bf16_t, float>
+{
+    template <bool clamp = false> // clamp is part of the common signature but is not used in this body
+    CK_TILE_DEVICE static CVecType
+    wmma_intrinsic(const AVecType& a_vec, const BVecType& b_vec, const CVecType& c_vec)
+    {
+#ifdef __gfx12__
+        return __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_vec, b_vec, c_vec);
+#else // __gfx12__ not defined: inputs are only assigned to ck_tile::ignore and a zero vector is returned
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        ck_tile::ignore = c_vec;
+        return CVecType{0.f};
+#endif
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_8bit_traits.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_8bit_traits.hpp
new file mode 100644
index 0000000000..81ff5af2fe
--- /dev/null
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_8bit_traits.hpp
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "warp_gemm_attribute_wmma_impl_base_traits.hpp"
+namespace ck_tile {
+// int8 specialization - GFX11: wraps __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32
+template <>
+struct WmmaTraits<gfx11_t, int8_t, int8_t, int32_t, 16, 16, 16>
+    : WmmaTraitsBase<gfx11_t, int8_t, int8_t, int32_t>
+{
+    template <bool clamp = false> // forwarded as the builtin's last argument
+    CK_TILE_DEVICE static CVecType
+    wmma_intrinsic(const AVecType& a_vec, const BVecType& b_vec, const CVecType& c_vec)
+    {
+#ifdef __gfx11__
+        return __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, // neg_a — NOTE(review): LLVM names this operand A_sign; confirm meaning
+                                                          bit_cast<int32x4_t>(a_vec),
+                                                          true, // neg_b — NOTE(review): LLVM names this operand B_sign; confirm meaning
+                                                          bit_cast<int32x4_t>(b_vec),
+                                                          bit_cast<int32x8_t>(c_vec),
+                                                          clamp);
+#else // __gfx11__ not defined: inputs are only assigned to ck_tile::ignore and a zero vector is returned
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        ck_tile::ignore = c_vec;
+        return CVecType{0};
+#endif
+    }
+};
+
+// int8 specialization - GFX12: wraps __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12
+template <>
+struct WmmaTraits<gfx12_t, int8_t, int8_t, int32_t, 16, 16, 16>
+    : WmmaTraitsBase<gfx12_t, int8_t, int8_t, int32_t>
+{
+    template <bool clamp = false> // forwarded as the builtin's last argument
+    CK_TILE_DEVICE static CVecType
+    wmma_intrinsic(const AVecType& a_vec, const BVecType& b_vec, const CVecType& c_vec)
+    {
+#ifdef __gfx12__
+        return __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, // neg_a — NOTE(review): LLVM names this operand A_sign; confirm meaning
+                                                                bit_cast<int32x2_t>(a_vec),
+                                                                true, // neg_b — NOTE(review): LLVM names this operand B_sign; confirm meaning
+                                                                bit_cast<int32x2_t>(b_vec),
+                                                                bit_cast<int32x8_t>(c_vec),
+                                                                clamp);
+#else // __gfx12__ not defined: inputs are only assigned to ck_tile::ignore and a zero vector is returned
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        ck_tile::ignore = c_vec;
+        return CVecType{0};
+#endif
+    }
+};
+
+// fp8 x fp8 specialization - GFX12: wraps __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12
+template <>
+struct WmmaTraits<gfx12_t, fp8_t, fp8_t, float, 16, 16, 16>
+    : WmmaTraitsBase<gfx12_t, fp8_t, fp8_t, float>
+{
+    template <bool clamp = false> // clamp is part of the common signature but is not used in this body
+    CK_TILE_DEVICE static CVecType
+    wmma_intrinsic(const AVecType& a_vec, const BVecType& b_vec, const CVecType& c_vec)
+    {
+#ifdef __gfx12__
+        return __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12( // A/B vectors are passed as int32x2_t
+            bit_cast<int32x2_t>(a_vec), bit_cast<int32x2_t>(b_vec), bit_cast<fp32x8_t>(c_vec));
+#else // __gfx12__ not defined: inputs are only assigned to ck_tile::ignore and a zero vector is returned
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        ck_tile::ignore = c_vec;
+        return CVecType{0};
+#endif
+    }
+};
+
+template <>
+struct WmmaTraits<gfx12_t, bf8_t, bf8_t, float, 16, 16, 16> // bf8 x bf8 -> float, GFX12
+    : WmmaTraitsBase<gfx12_t, bf8_t, bf8_t, float>
+{
+    template <bool clamp = false> // clamp is part of the common signature but is not used in this body
+    CK_TILE_DEVICE static CVecType
+    wmma_intrinsic(const AVecType& a_vec, const BVecType& b_vec, const CVecType& c_vec)
+    {
+#ifdef __gfx12__
+        return __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12( // A/B vectors are passed as int32x2_t
+            bit_cast<int32x2_t>(a_vec), bit_cast<int32x2_t>(b_vec), bit_cast<fp32x8_t>(c_vec));
+#else // __gfx12__ not defined: inputs are only assigned to ck_tile::ignore and a zero vector is returned
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        ck_tile::ignore = c_vec;
+        return CVecType{0};
+#endif
+    }
+};
+
+template <>
+struct WmmaTraits<gfx12_t, fp8_t, bf8_t, float, 16, 16, 16> // fp8 x bf8 -> float, GFX12
+    : WmmaTraitsBase<gfx12_t, fp8_t, bf8_t, float>
+{
+    template <bool clamp = false> // clamp is part of the common signature but is not used in this body
+    CK_TILE_DEVICE static CVecType
+    wmma_intrinsic(const AVecType& a_vec, const BVecType& b_vec, const CVecType& c_vec)
+    {
+#ifdef __gfx12__
+        return __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12( // A/B vectors are passed as int32x2_t
+            bit_cast<int32x2_t>(a_vec), bit_cast<int32x2_t>(b_vec), bit_cast<fp32x8_t>(c_vec));
+#else // __gfx12__ not defined: inputs are only assigned to ck_tile::ignore and a zero vector is returned
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        ck_tile::ignore = c_vec;
+        return CVecType{0};
+#endif
+    }
+};
+
+template <>
+struct WmmaTraits<gfx12_t, bf8_t, fp8_t, float, 16, 16, 16> // bf8 x fp8 -> float, GFX12
+    : WmmaTraitsBase<gfx12_t, bf8_t, fp8_t, float>
+{
+    template <bool clamp = false> // clamp is part of the common signature but is not used in this body
+    CK_TILE_DEVICE static CVecType
+    wmma_intrinsic(const AVecType& a_vec, const BVecType& b_vec, const CVecType& c_vec)
+    {
+#ifdef __gfx12__
+        return __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12( // A/B vectors are passed as int32x2_t
+            bit_cast<int32x2_t>(a_vec), bit_cast<int32x2_t>(b_vec), bit_cast<fp32x8_t>(c_vec));
+#else // __gfx12__ not defined: inputs are only assigned to ck_tile::ignore and a zero vector is returned
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        ck_tile::ignore = c_vec;
+        return CVecType{0};
+#endif
+    }
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_base_traits.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_base_traits.hpp
new file mode 100644
index 0000000000..7a3190e6f4
--- /dev/null
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma_impl_base_traits.hpp
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+namespace ck_tile {
+template <typename Arch, typename ADType, typename BDType, typename CDType>
+struct WmmaTraitsBase; // primary template is only declared; gfx11_t and gfx12_t partial specializations follow
+
+// GFX11 specialization: shared types and layout constants inherited by the gfx11 WmmaTraits
+template <typename ADType, typename BDType, typename CDType>
+struct WmmaTraitsBase<gfx11_t, ADType, BDType, CDType>
+{
+    using ADataType = ADType;
+    using BDataType = BDType;
+    using CDataType = CDType;
+
+    using AVecType = ext_vector_t<ADataType, 16>; // 16 elements, equal to kABK0PerLane * kABK1PerLane below
+    using BVecType = ext_vector_t<BDataType, 16>;
+    using CVecType = ext_vector_t<CDataType, 8>; // 8 elements, equal to kCM0PerLane * kCM1PerLane below
+
+    static constexpr index_t kM = 16; // equals kCM0PerLane * kCMLane * kCM1PerLane
+    static constexpr index_t kN = 16;
+    static constexpr index_t kK = 16; // equals kABK0PerLane * kABKLane * kABK1PerLane
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kRepeat      = 2; // gfx12 uses 1 here
+    static constexpr index_t kAMLane      = 16;
+    static constexpr index_t kBNLane      = 16;
+    static constexpr index_t kABK0PerLane = 1;
+    static constexpr index_t kABKLane     = 1;
+    static constexpr index_t kABK1PerLane = 16;
+
+    static constexpr index_t kCMLane     = 2;
+    static constexpr index_t kCNLane     = 16;
+    static constexpr index_t kCM0PerLane = 8;
+    static constexpr index_t kCM1PerLane = 1;
+
+    using kABPs2RHssMajor = sequence<0, 2, 1>; // three entries here; the gfx12 specialization has two
+    using kABPs2RHssMinor = sequence<0, 1, 0>;
+    using kABYs2RHsMajor  = sequence<2, 2>;
+    using kABYs2RHsMinor  = sequence<0, 2>;
+
+    using kCPs2RHssMajor = sequence<1, 2>;
+    using kCPs2RHssMinor = sequence<1, 0>;
+    using kCYs2RHsMajor  = sequence<1, 1>;
+    using kCYs2RHsMinor  = sequence<0, 2>;
+};
+
+// GFX12 specialization: shared types and layout constants inherited by the gfx12 WmmaTraits
+template <typename ADType, typename BDType, typename CDType>
+struct WmmaTraitsBase<gfx12_t, ADType, BDType, CDType>
+{
+    using ADataType = ADType;
+    using BDataType = BDType;
+    using CDataType = CDType;
+
+    using AVecType = ext_vector_t<ADataType, 8>; // 8 elements, equal to kABK0PerLane * kABK1PerLane below
+    using BVecType = ext_vector_t<BDataType, 8>;
+    using CVecType = ext_vector_t<CDataType, 8>; // 8 elements, equal to kCM0PerLane * kCM1PerLane below
+
+    static constexpr index_t kM = 16; // equals kCM0PerLane * kCMLane * kCM1PerLane
+    static constexpr index_t kN = 16;
+    static constexpr index_t kK = 16; // equals kABK0PerLane * kABKLane * kABK1PerLane
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kRepeat      = 1; // gfx11 uses 2 here
+    static constexpr index_t kAMLane      = 16;
+    static constexpr index_t kBNLane      = 16;
+    static constexpr index_t kABK0PerLane = 2;
+    static constexpr index_t kABKLane     = 2;
+    static constexpr index_t kABK1PerLane = 4;
+
+    static constexpr index_t kCMLane     = 2;
+    static constexpr index_t kCNLane     = 16;
+    static constexpr index_t kCM0PerLane = 1;
+    static constexpr index_t kCM1PerLane = 8;
+
+    using kABPs2RHssMajor = sequence<2, 1>; // two entries here; the gfx11 specialization has three
+    using kABPs2RHssMinor = sequence<1, 0>;
+    using kABYs2RHsMajor  = sequence<2, 2>;
+    using kABYs2RHsMinor  = sequence<0, 2>;
+
+    using kCPs2RHssMajor = sequence<1, 2>;
+    using kCPs2RHssMinor = sequence<1, 0>;
+    using kCYs2RHsMajor  = sequence<1, 1>;
+    using kCYs2RHsMinor  = sequence<0, 2>;
+};
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
index e91d505c8e..d50b208946 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
@@ -5,6 +5,7 @@
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
+#include "ck_tile/ops/gemm/warp/warp_wmma_gemm.hpp"
 
 namespace ck_tile {
 
@@ -19,111 +20,133 @@ template <typename AType,
           bool SwizzleA                     = false,
           bool UseStructuredSparsity        = false,
           WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
-struct WarpGemmMfmaDispatcher;
+struct WarpGemmDispatcher;
 
 // clang-format off
 // fp16
 // ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaF16F16F32M32N32K8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaF16F16F32M32N32K16<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, false, false, WGAttrNumAccessEnum::Double> {
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaF16F16F32M32N32K8; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, true>  { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaF16F16F32M32N32K16<>; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, true>  { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution<>; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, false, false, WGAttrNumAccessEnum::Double> {
     using Type = WarpGemmMfmaF16F16F32M32N32K16<WGAttrNumAccessEnum::Double>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, true, false, false, WGAttrNumAccessEnum::Double> {
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, true, false, false, WGAttrNumAccessEnum::Double> {
     using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution<WGAttrNumAccessEnum::Double>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaF16F16F32M16N16K16; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaF16F16F32M16N16K32<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false, false, false, WGAttrNumAccessEnum::Double> {
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaF16F16F32M16N16K32<>; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, true>  { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution<>; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false, false, false, WGAttrNumAccessEnum::Double> {
     using Type = WarpGemmMfmaF16F16F32M16N16K32<WGAttrNumAccessEnum::Double>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, true, false, false, WGAttrNumAccessEnum::Double> {
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, true, false, false, WGAttrNumAccessEnum::Double> {
     using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution<WGAttrNumAccessEnum::Double>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 4, 64, 16, false> { using Type = WarpGemmMfmaF16F16F32M4N64K16; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 64, 4, 16, false> { using Type = WarpGemmMfmaF16F16F32M64N4K16; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float,  4, 64, 16, false> { using Type = WarpGemmMfmaF16F16F32M4N64K16; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 64,  4, 16, false> { using Type = WarpGemmMfmaF16F16F32M64N4K16; };
+// WMMA cases
+#if defined(__gfx11__) || defined(__gfx12__)
+template<bool TransposeC> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, TransposeC, false> { using Type = WarpGemmWmma_f32_16x16x16_f16_f16<TransposeC>;};
+#else
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaF16F16F32M16N16K16; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, true>  { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; };
+#endif
 
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8,  true, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleBTransposedCDistribution; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16,  true, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution; };
 
 // fp16 2:4 structural sparsity
 // ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, false, true> { using Type = WarpGemmSmfmacF16F16F32M32N32K16; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false, false, true> { using Type = WarpGemmSmfmacF16F16F32M16N16K32; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, false, true> { using Type = WarpGemmSmfmacF16F16F32M32N32K16; };
+template<> struct WarpGemmDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false, false, true> { using Type = WarpGemmSmfmacF16F16F32M16N16K32; };
 
 // bf16
 // ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false, false, false, WGAttrNumAccessEnum::Double> {
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, true>  { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16<>; };
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true>  { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution<>; };
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false, false, false, WGAttrNumAccessEnum::Double> {
     using Type = WarpGemmMfmaBf16Bf16F32M32N32K16<WGAttrNumAccessEnum::Double>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true, false, false, WGAttrNumAccessEnum::Double> {
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true, false, false, WGAttrNumAccessEnum::Double> {
     using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution<WGAttrNumAccessEnum::Double>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, false, false, false, WGAttrNumAccessEnum::Double> {
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32<>; };
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, true>  { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution<>; };
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, false, false, false, WGAttrNumAccessEnum::Double> {
     using Type = WarpGemmMfmaBf16Bf16F32M16N16K32<WGAttrNumAccessEnum::Double>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, true, false, false, WGAttrNumAccessEnum::Double> {
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, true, false, false, WGAttrNumAccessEnum::Double> {
     using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution<WGAttrNumAccessEnum::Double>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 4, 64, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M4N64K16; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 64, 4, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M64N4K16; };
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float,  4, 64, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M4N64K16; };
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 64,  4, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M64N4K16; };
+// WMMA cases
+#if defined(__gfx11__) || defined(__gfx12__)
+template<bool TransposeC> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, TransposeC, false> { using Type = WarpGemmWmma_f32_16x16x16_bf16_bf16<TransposeC>; };
+#else
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; };
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; };
+#endif
 
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; };
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA; };
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; };
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, true, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleBTransposedCDistribution; };
+template<> struct WarpGemmDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution; };
 
 // fp8
 // ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  32, false> { using Type = WarpGemmMfma_f32_32x32x32_fp8_fp8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  32, false> { using Type = WarpGemmMfma_f32_16x16x32_fp8_fp8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  64, false> { using Type = WarpGemmMfma_f32_16x16x64_fp8_fp8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  32, false> { using Type = WarpGemmMfma_f32_32x32x32_bf8_bf8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  32, false> { using Type = WarpGemmMfma_f32_16x16x32_bf8_bf8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  64, false> { using Type = WarpGemmMfma_f32_16x16x64_bf8_bf8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; };
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; };
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  32, false> { using Type = WarpGemmMfma_f32_32x32x32_fp8_fp8; };
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  32, false> { using Type = WarpGemmMfma_f32_16x16x32_fp8_fp8; };
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  64, false> { using Type = WarpGemmMfma_f32_16x16x64_fp8_fp8; };
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; };
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; };
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; };
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; };
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; };
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; };
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  32, false> { using Type = WarpGemmMfma_f32_32x32x32_bf8_bf8; };
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  32, false> { using Type = WarpGemmMfma_f32_16x16x32_bf8_bf8; };
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  64, false> { using Type = WarpGemmMfma_f32_16x16x64_bf8_bf8; };
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; };
 
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_fp8_fp8<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_fp8_bf8<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_bf8_fp8<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_bf8_bf8<>; };
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_fp8_fp8<>; };
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_fp8_bf8<>; };
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_bf8_fp8<>; };
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  128, false> { using Type = WarpGemmMfma_f32_16x16x128_bf8_bf8<>; };
 
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_fp8_fp8<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_fp8_bf8<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_bf8_fp8<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_bf8_bf8<>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  64, false, false, false, WGAttrNumAccessEnum::Quad> {
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_fp8_fp8<>; };
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_fp8_bf8<>; };
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_bf8_fp8<>; };
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_bf8_bf8<>; };
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  64, false, false, false, WGAttrNumAccessEnum::Quad> {
     using Type = WarpGemmMfma_f32_32x32x64_fp8_fp8<WGAttrNumAccessEnum::Quad>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  64, false, false, false, WGAttrNumAccessEnum::Quad> {
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32,  64, false, false, false, WGAttrNumAccessEnum::Quad> {
     using Type = WarpGemmMfma_f32_32x32x64_fp8_bf8<WGAttrNumAccessEnum::Quad>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  64, false, false, false, WGAttrNumAccessEnum::Quad> {
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  64, false, false, false, WGAttrNumAccessEnum::Quad> {
     using Type = WarpGemmMfma_f32_32x32x64_bf8_fp8<WGAttrNumAccessEnum::Quad>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  64, false, false, false, WGAttrNumAccessEnum::Quad> {
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  64, false, false, false, WGAttrNumAccessEnum::Quad> {
     using Type = WarpGemmMfma_f32_32x32x64_bf8_bf8<WGAttrNumAccessEnum::Quad>; };
 
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  128, false, false, false, WGAttrNumAccessEnum::Quad> {
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  128, false, false, false, WGAttrNumAccessEnum::Quad> {
     using Type = WarpGemmMfma_f32_16x16x128_fp8_fp8<WGAttrNumAccessEnum::Quad>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16,  128, false, false, false, WGAttrNumAccessEnum::Quad> {
+template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16,  128, false, false, false, WGAttrNumAccessEnum::Quad> {
     using Type = WarpGemmMfma_f32_16x16x128_fp8_bf8<WGAttrNumAccessEnum::Quad>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 16, 16,  128, false, false, false, WGAttrNumAccessEnum::Quad> {
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 16, 16,  128, false, false, false, WGAttrNumAccessEnum::Quad> {
     using Type = WarpGemmMfma_f32_16x16x128_bf8_fp8<WGAttrNumAccessEnum::Quad>; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  128, false, false, false, WGAttrNumAccessEnum::Quad> {
+template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  128, false, false, false, WGAttrNumAccessEnum::Quad> {
     using Type = WarpGemmMfma_f32_16x16x128_bf8_bf8<WGAttrNumAccessEnum::Quad>; };
+// WMMA cases (16x16x16; no MFMA specialization exists for this shape, so no arch #if is needed)
+template<bool TransposeC> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  16, TransposeC, false> { using Type =WarpGemmWmma_f32_16x16x16_f8_f8<TransposeC>; };
+template<bool TransposeC> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16,  16, TransposeC, false> { using Type =WarpGemmWmma_f32_16x16x16_bf8_bf8<TransposeC>; };
+template<bool TransposeC> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16,  16, TransposeC, false> { using Type =WarpGemmWmma_f32_16x16x16_f8_bf8<TransposeC>; };
+template<bool TransposeC> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 16, 16,  16, TransposeC, false> { using Type =WarpGemmWmma_f32_16x16x16_bf8_f8<TransposeC>; };
+
 // int8
 // ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity
-template<> struct WarpGemmMfmaDispatcher<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 32, 32,  16, false> { using Type = WarpGemmMfma_i32_32x32x16_i8_i8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 32, 32,  16, true> { using Type = WarpGemmMfma_i32_32x32x16_i8_i8_CTransposed; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 16, 16,  32, false> { using Type = WarpGemmMfma_i32_16x16x32_i8_i8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 16, 16,  32, true> { using Type = WarpGemmMfma_i32_16x16x32_i8_i8_CTransposed; };
+template<> struct WarpGemmDispatcher<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 32, 32,  16, false> { using Type = WarpGemmMfma_i32_32x32x16_i8_i8; };
+template<> struct WarpGemmDispatcher<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 32, 32,  16, true>  { using Type = WarpGemmMfma_i32_32x32x16_i8_i8_CTransposed; };
+template<> struct WarpGemmDispatcher<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 16, 16,  32, false> { using Type = WarpGemmMfma_i32_16x16x32_i8_i8; };
+template<> struct WarpGemmDispatcher<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 16, 16,  32, true>  { using Type = WarpGemmMfma_i32_16x16x32_i8_i8_CTransposed; };
+// WMMA cases
+template<bool TransposeC> struct WarpGemmDispatcher<ck_tile::int8_t, ck_tile::int8_t, int32_t, 16, 16, 16, TransposeC, false> { using Type = WarpGemmWmma_i32_16x16x16_i8_i8<TransposeC>;};
 
 // clang-format on
 } // namespace impl
@@ -138,15 +161,15 @@ template <typename AType,
           bool SwizzleA                     = false,
           bool UseStructuredSparsity        = false,
           WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
-using WarpGemmMfmaDispatcher = typename impl::WarpGemmMfmaDispatcher<AType,
-                                                                     BType,
-                                                                     AccType,
-                                                                     MPerWave,
-                                                                     NPerWave,
-                                                                     KPerWave,
-                                                                     TransposeC,
-                                                                     SwizzleA,
-                                                                     UseStructuredSparsity,
-                                                                     AttrNumAccess>::Type;
+using WarpGemmDispatcher = typename impl::WarpGemmDispatcher<AType,
+                                                             BType,
+                                                             AccType,
+                                                             MPerWave,
+                                                             NPerWave,
+                                                             KPerWave,
+                                                             TransposeC,
+                                                             SwizzleA,
+                                                             UseStructuredSparsity,
+                                                             AttrNumAccess>::Type;
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/warp/warp_wmma_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_wmma_gemm.hpp
new file mode 100644
index 0000000000..cf477f7928
--- /dev/null
+++ b/include/ck_tile/ops/gemm/warp/warp_wmma_gemm.hpp
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma.hpp"
+
+namespace ck_tile {
+
+template <bool kTransC = false> // WarpGemmDispatcher target for <half_t, half_t, float, 16,16,16>
+using WarpGemmWmma_f32_16x16x16_f16_f16 =
+    WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_f32_16x16x16_f16_f16, kTransC>>;
+
+template <bool kTransC = false> // WarpGemmDispatcher target for <bf16_t, bf16_t, float, 16,16,16>
+using WarpGemmWmma_f32_16x16x16_bf16_bf16 =
+    WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_f32_16x16x16_bf16_bf16, kTransC>>;
+
+template <bool kTransC = false> // WarpGemmDispatcher target for <int8_t, int8_t, int32_t, 16,16,16>
+using WarpGemmWmma_i32_16x16x16_i8_i8 =
+    WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_i32_16x16x16_i8_i8, kTransC>>;
+
+template <bool kTransC = false> // WarpGemmDispatcher target for <fp8_t, fp8_t, float, 16,16,16>
+using WarpGemmWmma_f32_16x16x16_f8_f8 =
+    WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_f8, kTransC>>;
+
+template <bool kTransC = false> // WarpGemmDispatcher target for <bf8_t, bf8_t, float, 16,16,16>
+using WarpGemmWmma_f32_16x16x16_bf8_bf8 =
+    WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_bf8, kTransC>>;
+
+template <bool kTransC = false> // WarpGemmDispatcher target for <fp8_t, bf8_t, float, 16,16,16>
+using WarpGemmWmma_f32_16x16x16_f8_bf8 =
+    WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_bf8, kTransC>>;
+
+template <bool kTransC = false> // WarpGemmDispatcher target for <bf8_t, fp8_t, float, 16,16,16>
+using WarpGemmWmma_f32_16x16x16_bf8_f8 =
+    WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_f8, kTransC>>;
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp b/include/ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
index 4c136e78f7..c6b8882946 100644
--- a/include/ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
@@ -156,6 +156,8 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmQuantBase<Problem_>
 
         static constexpr index_t KPack      = WarpGemm::kKPerThread;
         static constexpr index_t KPerThread = KIterPerWarp * WarpGemm::kKPerThread;
+
+        static constexpr bool Preshuffle = Problem::Traits::Preshuffle;
     };
 
     public:
@@ -322,6 +324,7 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmQuantBase<Problem_>
             static_assert(std::is_same_v<CDataType, typename CBlockTensor::DataType>,
                           "The CDataType as defined in traits should be the same as correspoinding "
                           "C block tensor data type!");
+            constexpr auto warp_size = get_warp_size();
 
             // hot loop:
             static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
@@ -354,82 +357,153 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmQuantBase<Problem_>
                             }
                         });
 
-                        // Need to multiply aquant with accumulated C
-                        //
-                        // The accumulated C tile has the standard distribution. For example
-                        // lane 0 holds elements [0,0], [1,0], [2,0], [3,0], [8,0], [9,0],
-                        // [10,0], [11,0], [16,0], [17,0], [18,0], [19,0], [24,0], [25,0],
-                        // [26,0], [27,0].
-                        //
-                        // These elements are in different rows, need to get the scale value
-                        // for the corresponding row.
-                        // Based on aquant's tile distribution, it can be inferred which
-                        // lane holds the relevant scale. For example, the scales corresponding
-                        // to the 16 elements held by lane 0 are held by lanes 0, 1, 2, 3, 8, 9,
-                        // 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 respectively.
-                        //
-                        // These scales can be obtained using __builtin_amdgcn_ds_bpermute.
+                        if constexpr(Traits::Preshuffle)
+                        {
+                            // A view is created on top of the preshuffled AQ, where each row of the
+                            // view is composed of a row from a warp tile within an AQ block tile.
+                            // Multiple warp tile rows that belong to the same block tile are laid
+                            // out as consecutive rows.
+                            //
+                            // When we need to multiply a C warp tile with an AQ warp tile, thread 0
+                            // in the warp will load AQ_warp_tile[0], thread 1 will load
+                            // AQ_warp_tile[1], and so on, up to thread 63, which will load
+                            // AQ_warp_tile[63]. The VGPR file in the warp acts similarly to LDS in
+                            // this context, but we use cross-lane operations to access the data.
+                            // (Cross-lane operations are faster than using LDS.)
+                            //
+                            // Note that when the size of the AQ warp tile is smaller than the warp
+                            // size, you need to pad the rows in the view to ensure that each thread
+                            // can read one element.
+                            constexpr auto tbuf_offset =
+                                number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
+                                           merge_sequences(sequence<mIter, nIter>{},
+                                                           c_warp_y_index_zeros)) /
+                                       CBlockTensor::PackedSize>{};
+                            constexpr uint32_t kTileRowsOfCPerThread = 4;
 
-                        // MIters per warp
-                        constexpr index_t mIters_per_warp = get_warp_size() / WarpGemm::kM;
+                            static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
+                                [&](auto c_row) {
+                                    // For a warp tile of [16x16x32], take thread 0 as an example.
+                                    // Its VGPR[0] stores the value from C_tile[0,0], VGPR[1] stores
+                                    // C_tile[1,0], VGPR[2] stores C_tile[2,0], and VGPR[3] stores
+                                    // C_tile[3,0]. This means VGPR[0] should be multiplied by
+                                    // AQ_tile[0, 0], VGPR[1] by AQ_tile[1, 0], VGPR[2] by
+                                    // AQ_tile[2, 0], and VGPR[3] by AQ_tile[3, 0].
 
-                        // Reg block offset based on mIter
-                        constexpr index_t reg_block_offset =
-                            ((mIter / mIters_per_warp) * Traits::AQPerBlock);
+                                    // With one scale per block row, thread 0 reads AQ_tile[r, 0]
+                                    // from lane r; generally lane r * QScalesPerBlockRow + kQScale.
+                                    auto pull_from_lane =
+                                        ((threadIdx.x & (warp_size - 1)) / Traits::WarpGemm::kN *
+                                             kTileRowsOfCPerThread +
+                                         c_row) *
+                                            Traits::QScalesPerBlockRow +
+                                        kQScale;
+                                    auto& scale_reg = aq_block_tensor.get_thread_buffer()[mIter];
 
-                        constexpr index_t lane_base_offset =
-                            (mIter % mIters_per_warp) * WarpGemm::kM;
+                                    // Cross-lane gather of the scale via ds_bpermute (as a dword)
+                                    uint32_t scale_reg_dword;
 
-                        // Scale tensor offset along K
-                        constexpr index_t src_reg_offset = reg_block_offset + kQScale;
+                                    if constexpr(std::is_same_v<AQDataType, float>)
+                                    {
+                                        scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
+                                    }
+                                    else
+                                    {
+                                        scale_reg_dword = static_cast<uint32_t>(scale_reg);
+                                    }
 
-                        constexpr uint32_t kTileRows        = 4;
-                        constexpr uint32_t kTiledCMsPerWarp = WarpGemm::kCMLane * kTileRows;
+                                    int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
+                                        pull_from_lane << 2,
+                                        __builtin_bit_cast(int, scale_reg_dword));
 
-                        constexpr auto tbuf_offset =
-                            number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
-                                       merge_sequences(sequence<mIter, nIter>{},
-                                                       c_warp_y_index_zeros)) /
-                                   CBlockTensor::PackedSize>{};
+                                    float scale_reg_f = Base::cvt_scale_to_fp32(gathered_scale_reg);
 
-                        static_for<0, WarpGemm::kM, WarpGemm::kCMLane>{}([&](auto c_row) {
-                            // Multiply by 4 because output is stored in tiles of 4
-                            // x CNLane
-                            constexpr uint32_t row_base =
-                                ((c_row / kTiledCMsPerWarp) * kTiledCMsPerWarp) +
-                                ((c_row % kTiledCMsPerWarp) / WarpGemm::kCMLane);
+                                    c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
+                                        (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f *
+                                         kA_cvt_scale * kB_cvt_scale);
+                                });
+                        }
+                        else
+                        {
+                            // Need to multiply aquant with accumulated C
+                            //
+                            // The accumulated C tile has the standard distribution. For example
+                            // lane 0 holds elements [0,0], [1,0], [2,0], [3,0], [8,0], [9,0],
+                            // [10,0], [11,0], [16,0], [17,0], [18,0], [19,0], [24,0], [25,0],
+                            // [26,0], [27,0].
+                            //
+                            // These elements are in different rows, need to get the scale value
+                            // for the corresponding row.
+                            // Based on aquant's tile distribution, it can be inferred which
+                            // lane holds the relevant scale. For example, the scales corresponding
+                            // to the 16 elements held by lane 0 are held by lanes 0, 1, 2, 3, 8, 9,
+                            // 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 respectively.
+                            //
+                            // These scales can be obtained using __builtin_amdgcn_ds_bpermute.
 
-                            constexpr uint32_t reg_offset_for_row_data = c_row / WarpGemm::kCMLane;
+                            // MIters per warp
+                            constexpr index_t mIters_per_warp = get_warp_size() / WarpGemm::kM;
 
-                            // Lane index to source scale from
-                            uint32_t src_lane_idx = lane_base_offset + row_base +
-                                                    (__lane_id() / WarpGemm::kN * kTileRows);
+                            // Reg block offset based on mIter
+                            constexpr index_t reg_block_offset =
+                                ((mIter / mIters_per_warp) * Traits::AQPerBlock);
 
-                            // Directly index into thread buffer corresponding to
-                            // desired row coefficient
-                            auto& scale_reg = aq_block_tensor.get_thread_buffer()[src_reg_offset];
-                            uint32_t scale_reg_dword;
+                            constexpr index_t lane_base_offset =
+                                (mIter % mIters_per_warp) * WarpGemm::kM;
 
-                            if constexpr(std::is_same_v<AQDataType, float>)
-                            {
-                                scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
-                            }
-                            else
-                            {
-                                scale_reg_dword = static_cast<uint32_t>(scale_reg);
-                            }
+                            // Scale tensor offset along K
+                            constexpr index_t src_reg_offset = reg_block_offset + kQScale;
 
-                            // Pull scale data across lanes
-                            int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
-                                src_lane_idx * 4, __builtin_bit_cast(int, scale_reg_dword));
+                            constexpr uint32_t kTileRows        = 4;
+                            constexpr uint32_t kTiledCMsPerWarp = WarpGemm::kCMLane * kTileRows;
 
-                            float scale_reg_f = Base::cvt_scale_to_fp32(gathered_scale_reg);
+                            constexpr auto tbuf_offset =
+                                number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
+                                           merge_sequences(sequence<mIter, nIter>{},
+                                                           c_warp_y_index_zeros)) /
+                                       CBlockTensor::PackedSize>{};
 
-                            c_block_tensor
-                                .get_thread_buffer()[tbuf_offset + reg_offset_for_row_data] +=
-                                (c_warp_tensor.get_thread_buffer()[reg_offset_for_row_data] *
-                                 scale_reg_f * kA_cvt_scale * kB_cvt_scale);
-                        });
+                            static_for<0, WarpGemm::kM, WarpGemm::kCMLane>{}([&](auto c_row) {
+                                // Multiply by 4 because output is stored in tiles of 4
+                                // x CNLane
+                                constexpr uint32_t row_base =
+                                    ((c_row / kTiledCMsPerWarp) * kTiledCMsPerWarp) +
+                                    ((c_row % kTiledCMsPerWarp) / WarpGemm::kCMLane);
+
+                                constexpr uint32_t reg_offset_for_row_data =
+                                    c_row / WarpGemm::kCMLane;
+
+                                // Lane index to source scale from
+                                uint32_t src_lane_idx = lane_base_offset + row_base +
+                                                        (__lane_id() / WarpGemm::kN * kTileRows);
+
+                                // Directly index into thread buffer corresponding to
+                                // desired row coefficient
+                                auto& scale_reg =
+                                    aq_block_tensor.get_thread_buffer()[src_reg_offset];
+                                uint32_t scale_reg_dword;
+
+                                if constexpr(std::is_same_v<AQDataType, float>)
+                                {
+                                    scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
+                                }
+                                else
+                                {
+                                    scale_reg_dword = static_cast<uint32_t>(scale_reg);
+                                }
+
+                                // Pull scale data across lanes
+                                int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
+                                    src_lane_idx * 4, __builtin_bit_cast(int, scale_reg_dword));
+
+                                float scale_reg_f = Base::cvt_scale_to_fp32(gathered_scale_reg);
+
+                                c_block_tensor
+                                    .get_thread_buffer()[tbuf_offset + reg_offset_for_row_data] +=
+                                    (c_warp_tensor.get_thread_buffer()[reg_offset_for_row_data] *
+                                     scale_reg_f * kA_cvt_scale * kB_cvt_scale);
+                            });
+                        }
                     });
                 });
             });
diff --git a/include/ck_tile/ops/gemm_group_quant/kernel/gemm_aquant_kernel.hpp b/include/ck_tile/ops/gemm_group_quant/kernel/gemm_aquant_kernel.hpp
index b1f89fe2e2..6973c80d57 100644
--- a/include/ck_tile/ops/gemm_group_quant/kernel/gemm_aquant_kernel.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/kernel/gemm_aquant_kernel.hpp
@@ -3,11 +3,14 @@
 
 #pragma once
 
-#include <iostream>
 #include <string>
 
 #include "ck_tile/core.hpp"
-#include "ck_tile/ops/common.hpp"
+#include "ck_tile/core/algorithm/coordinate_transform.hpp"
+#include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/core/container/tuple.hpp"
+#include "ck_tile/core/numeric/integer.hpp"
+#include "ck_tile/core/numeric/math.hpp"
 #include "ck_tile/host/concat.hpp"
 
 namespace ck_tile {
@@ -96,14 +99,15 @@ struct AQuantGemmKernelArgs
 template <typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
 struct AQuantGemmKernel
 {
-    using TilePartitioner                    = remove_cvref_t<TilePartitioner_>;
-    using GemmPipeline                       = remove_cvref_t<GemmPipeline_>;
-    using EpiloguePipeline                   = remove_cvref_t<EpiloguePipeline_>;
-    using ALayout                            = remove_cvref_t<typename GemmPipeline::ALayout>;
-    using AQLayout                           = remove_cvref_t<typename GemmPipeline::AQLayout>;
-    using BLayout                            = remove_cvref_t<typename GemmPipeline::BLayout>;
-    using CLayout                            = remove_cvref_t<typename GemmPipeline::CLayout>;
-    static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
+    using TilePartitioner               = remove_cvref_t<TilePartitioner_>;
+    using GemmPipeline                  = remove_cvref_t<GemmPipeline_>;
+    using EpiloguePipeline              = remove_cvref_t<EpiloguePipeline_>;
+    using ALayout                       = remove_cvref_t<typename GemmPipeline::ALayout>;
+    using AQLayout                      = remove_cvref_t<typename GemmPipeline::AQLayout>;
+    using BLayout                       = remove_cvref_t<typename GemmPipeline::BLayout>;
+    using CLayout                       = remove_cvref_t<typename GemmPipeline::CLayout>;
+    static constexpr index_t kBlockSize = GemmPipeline::BlockSize;
+    static constexpr bool Preshuffle    = GemmPipeline::Preshuffle;
 
     using ADataType  = remove_cvref_t<typename GemmPipeline::ADataType>;
     using AQDataType = remove_cvref_t<typename GemmPipeline::AQDataType>;
@@ -127,7 +131,7 @@ struct AQuantGemmKernel
         return dim3(TilePartitioner::GridSize(M, N), 1, KBatch);
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
+    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
 
     CK_TILE_HOST static constexpr AQuantGemmKernelArgs
     MakeKernelArgs(const AQuantGemmHostArgs& hostArgs)
@@ -157,7 +161,7 @@ struct AQuantGemmKernel
         __device__ SplitKBatchOffset(const AQuantGemmKernelArgs& kargs,
                                      const std::size_t k_id = blockIdx.z)
         {
-            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
+            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(I2);
             const index_t K_t   = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
             const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1);
 
@@ -372,14 +376,75 @@ struct AQuantGemmKernel
             }
         }();
 
+        const auto get_padding_size = [](index_t length, index_t alignment) {
+            return ck_tile::integer_least_multiple(length, alignment) - length;
+        };
+
+        const auto& make_preshuffled_aq_tensor_view = [&]() {
+            const auto aq_x = kargs.M * GemmPipeline::KPerBlockAQ;
+            const auto aq_y = kargs.QK / GemmPipeline::KPerBlockAQ;
+
+            const auto aq_desc =
+                make_naive_tensor_descriptor(make_tuple(aq_y, aq_x),
+                                             make_tuple(aq_x, 1),
+                                             number<GemmPipeline::GetVectorSizeAQ()>{},
+                                             number<1>{});
+
+            const auto block_tile_size = GemmPipeline::MPerBlock * GemmPipeline::KPerBlockAQ;
+            const auto aq_pad0_desc    = transform_tensor_descriptor(
+                aq_desc,
+                make_tuple(make_pass_through_transform(aq_y),
+                           make_right_pad_transform(aq_x, get_padding_size(aq_x, block_tile_size))),
+                make_tuple(sequence<0>{}, sequence<1>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}));
+
+            const auto pad_aq_x = aq_pad0_desc.get_lengths()[I1];
+            const auto wave_tile_size =
+                TilePartitioner::BlockGemmShape::WarpTile::at(I0) * GemmPipeline::KPerBlockAQ;
+            const auto wave_tile_count_x = ck_tile::integer_divide_ceil(pad_aq_x, wave_tile_size);
+            const auto aq_unmerge_pad0_desc = transform_tensor_descriptor(
+                aq_pad0_desc,
+                make_tuple(make_pass_through_transform(aq_y),
+                           make_unmerge_transform(make_tuple(wave_tile_count_x, wave_tile_size))),
+                make_tuple(sequence<0>{}, sequence<1>{}),
+                make_tuple(sequence<0>{}, sequence<1, 2>{}));
+
+            const auto aq_pad1_desc = transform_tensor_descriptor(
+                aq_unmerge_pad0_desc,
+                make_tuple(make_pass_through_transform(aq_y),
+                           make_pass_through_transform(wave_tile_count_x),
+                           make_right_pad_transform(
+                               wave_tile_size, get_padding_size(wave_tile_size, get_warp_size()))),
+                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
+
+            const auto pad_wave_size =
+                ck_tile::integer_least_multiple(wave_tile_size, get_warp_size());
+            const auto aq_merge_pad1_desc = transform_tensor_descriptor(
+                aq_pad1_desc,
+                make_tuple(make_merge_transform(make_tuple(wave_tile_count_x, aq_y)),
+                           make_pass_through_transform(pad_wave_size)),
+                make_tuple(sequence<1, 0>{}, sequence<2>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}));
+
+            return make_tensor_view<address_space_enum::global>(aq_ptr, aq_merge_pad1_desc);
+        };
+
         const auto& aq_tensor_view = [&]() {
             static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
-            return make_naive_tensor_view<address_space_enum::global>(
-                aq_ptr,
-                make_tuple(kargs.M, kargs.QK),
-                make_tuple(kargs.stride_AQ, 1),
-                number<GemmPipeline::GetVectorSizeAQ()>{},
-                number<1>{});
+            if constexpr(Preshuffle)
+            {
+                return make_preshuffled_aq_tensor_view();
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    aq_ptr,
+                    make_tuple(kargs.M, kargs.QK),
+                    make_tuple(kargs.stride_AQ, 1),
+                    number<GemmPipeline::GetVectorSizeAQ()>{},
+                    number<1>{});
+            }
         }();
 
         const auto& b_tensor_view = [&]() {
@@ -491,16 +556,7 @@ struct AQuantGemmKernel
             }
         }();
 
-        const auto& aq_pad_view = [&]() {
-            const auto& aq_tensor_view = views.at(I1);
-            static_assert(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>);
-            return pad_tensor_view(
-                aq_tensor_view,
-                make_tuple(number<TilePartitioner::MPerBlock>{},
-                           number<TilePartitioner::KPerBlock / GemmPipeline::QuantGroupSize>{}),
-                // TODO: Add support for padding.
-                sequence<false, false>{});
-        }();
+        const auto& aq_pad_view = [&]() { return views.at(I1); }();
 
         const auto& b_pad_view = [&]() {
             const auto& b_tensor_view = views.at(I2);
@@ -543,8 +599,10 @@ struct AQuantGemmKernel
     }
 
     template <typename PadView>
-    CK_TILE_DEVICE static auto
-    MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n)
+    CK_TILE_DEVICE static auto MakeGemmTileWindows(const PadView& views,
+                                                   const AQuantGemmKernelArgs& kargs,
+                                                   const index_t i_m,
+                                                   const index_t i_n)
     {
         const auto& a_pad_view  = views.at(I0);
         const auto& aq_pad_view = views.at(I1);
@@ -570,11 +628,26 @@ struct AQuantGemmKernel
 
         const auto& aq_block_window = [&]() {
             static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
-            return make_tile_window(
-                aq_pad_view,
-                make_tuple(number<TilePartitioner::MPerBlock>{},
-                           number<TilePartitioner::KPerBlock / GemmPipeline::QuantGroupSize>{}),
-                {i_m, 0});
+            if constexpr(Preshuffle)
+            {
+                constexpr auto tile_window_width = get_warp_size();
+                constexpr auto tile_window_height =
+                    TilePartitioner::MPerBlock / TilePartitioner::BlockGemmShape::WarpTile::at(I0);
+                auto block_m_idx = i_m / TilePartitioner::MPerBlock;
+                return make_tile_window(
+                    aq_pad_view,
+                    make_tuple(number<tile_window_height>{}, number<tile_window_width>{}),
+                    {block_m_idx * kargs.K / TilePartitioner::BlockGemmShape::BlockTile::at(I2),
+                     0});
+            }
+            else
+            {
+                return make_tile_window(
+                    aq_pad_view,
+                    make_tuple(number<TilePartitioner::MPerBlock>{},
+                               number<TilePartitioner::KPerBlock / GemmPipeline::QuantGroupSize>{}),
+                    {i_m, 0});
+            }
         }();
 
         const auto& b_block_window = [&]() {
@@ -633,7 +706,8 @@ struct AQuantGemmKernel
             a_ptr, b_ptr, aq_ptr, c_ptr, kargs, splitk_batch_offset);
 
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
-        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+        auto gemm_tile_windows =
+            MakeGemmTileWindows(gemm_pad_views, kargs, block_idx_m, block_idx_n);
 
         const index_t num_loop = __builtin_amdgcn_readfirstlane(
             TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp b/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp
index 1356d7e222..ed13adf10e 100644
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp
@@ -38,12 +38,9 @@ struct GemmAQuantPipelineAgBgCrImplBase : public GemmPipelineAgBgCrImplBase<Prob
     {
         static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
 
-        using YPerTile = number<MPerBlock>;
-        using XPerTile = number<KPerBlockAQ>;
-
         auto aq_copy_dram_window =
             make_tile_window(aq_dram_block_window_tmp.get_bottom_tensor_view(),
-                             make_tuple(YPerTile(), XPerTile()),
+                             aq_dram_block_window_tmp.get_window_lengths(),
                              aq_dram_block_window_tmp.get_window_origin(),
                              Policy::template MakeAQDramTileDistribution<Problem>());
         return aq_copy_dram_window;
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp
index 2004f7d90e..1fb92ad14d 100644
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp
@@ -42,24 +42,45 @@ struct GemmAQuantPipelineAgBgCrDefaultPolicy : public UniversalGemmPipelineAgBgC
         constexpr index_t KPerBlock   = Problem::BlockGemmShape::kK;
         constexpr index_t KPerBlockAQ = KPerBlock / Problem::kQuantGroupSize;
         constexpr index_t VecLoadSize = GetVectorSizeAQ<Problem>();
+        constexpr bool Preshuffle     = Problem::Traits::Preshuffle;
         using WarpTile                = typename Problem::BlockGemmShape::WarpTile;
-        using WarpGemm                = WarpGemmMfmaDispatcher<typename Problem::ComputeDataType,
-                                                               typename Problem::ComputeDataType,
-                                                               typename Problem::CDataType,
-                                                               WarpTile::at(I0),
-                                                               WarpTile::at(I1),
-                                                               WarpTile::at(I2),
-                                                               false>;
+        using WarpGemm                = WarpGemmDispatcher<typename Problem::ComputeDataType,
+                                                           typename Problem::ComputeDataType,
+                                                           typename Problem::CDataType,
+                                                           WarpTile::at(I0),
+                                                           WarpTile::at(I1),
+                                                           WarpTile::at(I2),
+                                                           false>;
 
         static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
-        using TileEncodingPattern = TileDistributionEncodingPatternAQ<BlockGemmShape,
-                                                                      WarpGemm,
-                                                                      BlockSize,
-                                                                      MPerBlock,
-                                                                      KPerBlockAQ,
-                                                                      VecLoadSize>;
+        if constexpr(Preshuffle)
+        {
+            using TileEncodingPattern =
+                TileDistributionEncodingPatternAQ<BlockGemmShape,
+                                                  WarpGemm,
+                                                  BlockSize,
+                                                  MPerBlock / WarpGemm::kM,
+                                                  ck_tile::integer_least_multiple(
+                                                      WarpGemm::kM * KPerBlockAQ, get_warp_size()),
+                                                  KPerBlockAQ,
+                                                  VecLoadSize,
+                                                  Preshuffle>;
 
-        return TileEncodingPattern::Make2DStaticTileDistribution();
+            return TileEncodingPattern::Make2DStaticTileDistribution();
+        }
+        else
+        {
+            using TileEncodingPattern = TileDistributionEncodingPatternAQ<BlockGemmShape,
+                                                                          WarpGemm,
+                                                                          BlockSize,
+                                                                          MPerBlock,
+                                                                          KPerBlockAQ,
+                                                                          KPerBlockAQ,
+                                                                          VecLoadSize,
+                                                                          Preshuffle>;
+
+            return TileEncodingPattern::Make2DStaticTileDistribution();
+        }
     }
 
     template <typename Problem>
@@ -71,13 +92,13 @@ struct GemmAQuantPipelineAgBgCrDefaultPolicy : public UniversalGemmPipelineAgBgC
         static_assert(Problem::kQuantGroupSize % WarpTile::at(I2) == 0,
                       "KPerWarpGemm must be a multiple of kQuantGroupSize!");
 
-        using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ComputeDataType,
-                                                typename Problem::ComputeDataType,
-                                                typename Problem::CDataType,
-                                                WarpTile::at(I0),
-                                                WarpTile::at(I1),
-                                                WarpTile::at(I2),
-                                                false>;
+        using WarpGemm = WarpGemmDispatcher<typename Problem::ComputeDataType,
+                                            typename Problem::ComputeDataType,
+                                            typename Problem::CDataType,
+                                            WarpTile::at(I0),
+                                            WarpTile::at(I1),
+                                            WarpTile::at(I2),
+                                            false>;
         static_assert(std::is_same_v<typename Problem::ComputeDataType, fp8_t> ||
                       std::is_same_v<typename Problem::ComputeDataType, bf8_t>);
         static_assert(std::is_same_v<typename Problem::CDataType, float>);
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp b/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
index 746396b13a..64b2402aa5 100644
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
@@ -7,7 +7,6 @@
 #include <sstream>
 
 #include "ck_tile/core.hpp"
-#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/host/concat.hpp"
@@ -134,6 +133,7 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseAQuantGemmPipelineAgBgCrCompV
     static constexpr bool kPadK = Problem::kPadK;
 
     static constexpr bool DoubleSmemBuffer = Problem::DoubleSmemBuffer;
+    static constexpr bool Preshuffle       = Problem::Traits::Preshuffle;
 
     static constexpr bool HasHotLoop = Problem::HasHotLoop;
     static constexpr auto TailNum    = Problem::TailNum;
@@ -254,9 +254,6 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseAQuantGemmPipelineAgBgCrCompV
             constexpr bool is_b_row_major = std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>;
 
             static_assert(!is_aq_col_major, "Aq must be row major (col major not supported yet)");
-            static_assert(MPerBlock == AQDramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
-                              KPerBlockAQ == AQDramBlockWindowTmp{}.get_window_lengths()[I1{}],
-                          "Aq block window has incorrect lengths for defined AqLayout!");
 
             static_assert(is_a_col_major
                               ? (KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
@@ -312,8 +309,11 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseAQuantGemmPipelineAgBgCrCompV
                 is_a_col_major ? make_array(KPerBlock, 0) : make_array(0, KPerBlock);
             constexpr BDramTileWindowStep b_dram_tile_window_step =
                 is_b_row_major ? make_array(KPerBlock, 0) : make_array(0, KPerBlock);
+
+            // only row_major for AQ
             constexpr AQDramTileWindowStep aq_dram_tile_window_step =
-                is_aq_col_major ? make_array(KPerBlockAQ, 0) : make_array(0, KPerBlockAQ);
+                Preshuffle ? make_array(MPerBlock / BlockGemm::WarpGemm::kM, 0)
+                           : make_array(0, KPerBlockAQ);
 
             // DRAM prefetch (global read 0)
             Base::GlobalPrefetch(a_block_tile, a_copy_dram_window, a_dram_tile_window_step);
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_group_quant_utils.hpp b/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_group_quant_utils.hpp
index c018314ab7..051543b8b6 100644
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_group_quant_utils.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_group_quant_utils.hpp
@@ -50,10 +50,11 @@ template <typename BlockGemmShape,
           index_t BlockSize,
           index_t YPerTile,
           index_t XPerTile,
-          index_t VecSize>
+          index_t KPerBlockAQ,
+          index_t VecSize,
+          bool Preshuffle>
 struct TileDistributionEncodingPatternAQ : public TileDistributionEncodingPattern
 {
-    // TODO: make pattern where below condition does not need to hold - GGemmMultiDSplitk!
     static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
     static constexpr index_t warp_size = get_warp_size();
     static constexpr index_t num_warps = BlockSize / get_warp_size();
@@ -69,26 +70,46 @@ struct TileDistributionEncodingPatternAQ : public TileDistributionEncodingPatter
     // KWarps > 1 isn't supported
     static_assert(KWarps == 1);
 
-    // # of elements per thread
-    static constexpr index_t X = XPerTile;
-
-    static constexpr index_t Y0 = 1;
-    static constexpr index_t Y1 = MIterPerWarp ? MIterPerWarp : 1;
-    static constexpr index_t Y2 = MWarps;
-    static constexpr index_t Y3 = WarpGemm::kM;
-    static_assert(Y3 >= WarpGemm::kM, "Scales for all rows must be available within the warp.");
-    static_assert(Y0 * Y1 * Y2 * Y3 == YPerTile,
-                  "Y0, Y1, Y2, Y3 must cover the blocktile along Y.");
-
     CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
     {
-        return make_static_tile_distribution(
-            tile_distribution_encoding<sequence<NWarps>,
-                                       tuple<sequence<Y0, Y1, Y2, Y3>, sequence<X>>,
-                                       tuple<sequence<1, 0>, sequence<1, 1>>,
-                                       tuple<sequence<2, 0>, sequence<0, 3>>,
-                                       sequence<1, 2>,
-                                       sequence<1, 0>>{});
+        if constexpr(Preshuffle)
+        {
+            // # of elements per thread
+            constexpr index_t X2 = KPerBlockAQ;
+            constexpr index_t X1 = warp_size / X2;
+            constexpr index_t X0 = XPerTile / warp_size;
+
+            constexpr index_t Y1 = MWarps;
+            constexpr index_t Y0 = YPerTile / Y1;
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<NWarps>,
+                                           tuple<sequence<Y0, Y1>, sequence<X0, X1, X2>>,
+                                           tuple<sequence<1, 0>, sequence<2, 2>>,
+                                           tuple<sequence<1, 0>, sequence<1, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<0, 0>>{});
+        }
+        else
+        {
+            // # of elements per thread
+            constexpr index_t X = XPerTile;
+
+            constexpr index_t Y0 = 1;
+            constexpr index_t Y1 = MIterPerWarp ? MIterPerWarp : 1;
+            constexpr index_t Y2 = MWarps;
+            constexpr index_t Y3 = WarpGemm::kM;
+            static_assert(Y3 >= WarpGemm::kM,
+                          "Scales for all rows must be available within the warp.");
+            static_assert(Y0 * Y1 * Y2 * Y3 == YPerTile,
+                          "Y0, Y1, Y2, Y3 must cover the blocktile along Y.");
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<NWarps>,
+                                           tuple<sequence<Y0, Y1, Y2, Y3>, sequence<X>>,
+                                           tuple<sequence<1, 0>, sequence<1, 1>>,
+                                           tuple<sequence<2, 0>, sequence<0, 3>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 0>>{});
+        }
     }
 };
 
diff --git a/include/ck_tile/ops/gemm_group_quant/pipeline/tile_gemm_aquant_traits.hpp b/include/ck_tile/ops/gemm_group_quant/pipeline/tile_gemm_aquant_traits.hpp
index 4972badb3f..41f8f1deef 100644
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/tile_gemm_aquant_traits.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/pipeline/tile_gemm_aquant_traits.hpp
@@ -10,6 +10,7 @@ namespace ck_tile {
 template <bool kPadM_,
           bool kPadN_,
           bool kPadK_,
+          bool Preshuffle_,
           typename ALayout_,
           typename BLayout_,
           typename CLayout_,
@@ -29,6 +30,7 @@ struct TileGemmAQuantTraits
 
     static constexpr bool UseStructuredSparsity = false;
     static constexpr index_t NumWaveGroups      = 1;
+    static constexpr bool Preshuffle            = Preshuffle_;
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/grouped_convolution.hpp b/include/ck_tile/ops/grouped_convolution.hpp
index 29332f941a..09b50f26b0 100644
--- a/include/ck_tile/ops/grouped_convolution.hpp
+++ b/include/ck_tile/ops/grouped_convolution.hpp
@@ -3,10 +3,12 @@
 
 #pragma once
 
+#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp"
 #include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp"
 #include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp"
 #include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp"
 #include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
+#include "ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_data_to_gemm.hpp"
 #include "ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_weight_to_gemm.hpp"
 #include "ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp
new file mode 100644
index 0000000000..282a187eae
--- /dev/null
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp
@@ -0,0 +1,985 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/host/concat.hpp"
+#include "ck_tile/core/utility/env.hpp"
+#include "ck_tile/host/convolution_parameter.hpp"
+#include "ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_data_to_gemm.hpp"
+#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
+
+namespace ck_tile {
+
+/// @brief The Grouped Convolution kernel device arguments.
+template <typename GroupedConvTraitsType_, typename TilePartitioner_>
+struct GroupedConvBwdDataKernelArgs
+{
+    using TilePartitioner = remove_cvref_t<TilePartitioner_>;
+
+    using ConvToGemmTransformer =
+        TransformConvBwdDataToGemm<GroupedConvTraitsType_::NDimSpatial,
+                                   GroupedConvTraitsType_::ConvSpecialization>;
+    static constexpr index_t NumDTensor = GroupedConvTraitsType_::NumDTensor;
+
+    static constexpr auto I0 = number<0>();
+    static constexpr auto I1 = number<1>();
+
+    template <
+        typename InLay                      = typename GroupedConvTraitsType_::InLayout,
+        typename WeiLay                     = typename GroupedConvTraitsType_::WeiLayout,
+        typename OutLay                     = typename GroupedConvTraitsType_::OutLayout,
+        typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NWGC> &&
+                                    std::is_same_v<WeiLay, tensor_layout::convolution::GKXC> &&
+                                    std::is_same_v<OutLay, tensor_layout::convolution::NWGK>,
+                                bool>::type = false>
+    CK_TILE_HOST GroupedConvBwdDataKernelArgs(const GroupedConvBwdDataHostArgs& args)
+    { // 1D overload: enabled only for the NWGC / GKXC / NWGK layout combination.
+        in_g_n_c_wis_lengths  = {static_cast<index_t>(args.G_),
+                                 static_cast<index_t>(args.N_),
+                                 static_cast<index_t>(args.C_),
+                                 static_cast<index_t>(args.input_spatial_lengths_[0])};
+        wei_g_k_c_xs_lengths  = {static_cast<index_t>(args.G_),
+                                 static_cast<index_t>(args.K_),
+                                 static_cast<index_t>(args.C_),
+                                 static_cast<index_t>(args.filter_spatial_lengths_[0])};
+        out_g_n_k_wos_lengths = {static_cast<index_t>(args.G_),
+                                 static_cast<index_t>(args.N_),
+                                 static_cast<index_t>(args.K_),
+                                 static_cast<index_t>(args.output_spatial_lengths_[0])};
+
+        conv_filter_strides   = {static_cast<index_t>(args.conv_filter_strides_[0])};
+        conv_filter_dilations = {static_cast<index_t>(args.conv_filter_dilations_[0])};
+        input_left_pads       = {static_cast<index_t>(args.input_left_pads_[0])};
+        input_right_pads      = {static_cast<index_t>(args.input_right_pads_[0])};
+
+        k_batch = args.k_batch;
+
+        in_ptr  = args.in_ptr;
+        wei_ptr = args.wei_ptr;
+        for(index_t d = 0; d < NumDTensor; d++)
+        {
+            ds_ptr[d] = args.ds_ptr[d];
+        }
+        out_ptr = args.out_ptr;
+
+        const index_t X               = wei_g_k_c_xs_lengths[3];
+        const index_t ConvStrideW     = conv_filter_strides[0];
+        const index_t ConvDilationW   = conv_filter_dilations[0];
+        const auto GcdStrideDilationW = gcd(ConvStrideW, ConvDilationW);
+        const auto XTilde             = ConvStrideW / GcdStrideDilationW;
+
+        for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde)
+        { // each i_xtilde yields at most one GEMM (descriptor set + block range)
+            const auto XDotSlice = integer_divide_ceil(X - i_xtilde, XTilde);
+
+            if(XDotSlice <= 0)
+            {
+                continue; // slice size <= 0: register no GEMM
+            }
+
+            if(gemm_count >= MaxGroupedGemmGroupsNum)
+            {
+                gemm_count++;
+                // Descriptor arrays are full: keep counting, skip the writes.
+                continue;
+            }
+
+            tildes = {i_xtilde};
+
+            ConvToGemmTransformer conv_to_gemm_transformer{in_g_n_c_wis_lengths,
+                                                           wei_g_k_c_xs_lengths,
+                                                           out_g_n_k_wos_lengths,
+                                                           conv_filter_strides,
+                                                           conv_filter_dilations,
+                                                           input_left_pads,
+                                                           input_right_pads,
+                                                           tildes};
+
+            auto grid_descs =
+                conv_to_gemm_transformer.template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<
+                    GroupedConvTraitsType_::NDimSpatial>(1);
+
+            a_grid_descs_m_k[gemm_count] = grid_descs.at(number<0>{});
+            b_grid_descs_n_k[gemm_count] = grid_descs.at(number<1>{});
+            c_grid_descs_m_n[gemm_count] = grid_descs.at(number<2>{});
+
+            const index_t grid_size_grp =
+                TilePartitioner::GridSize(c_grid_descs_m_n[gemm_count].get_length(I0),
+                                          c_grid_descs_m_n[gemm_count].get_length(I1));
+
+            block_starts[gemm_count] = grid_size_;                 // sum of earlier sizes
+            block_ends[gemm_count]   = grid_size_ + grid_size_grp; // start + own size
+
+            grid_size_ += grid_size_grp; // running total over all registered GEMMs
+
+            ++gemm_count;
+        }
+        group_stride_a = args.K_; // A: Out NWGK
+        group_stride_b = args.K_ * args.C_ *
+                         std::accumulate(args.filter_spatial_lengths_.begin(),
+                                         args.filter_spatial_lengths_.end(),
+                                         1,
+                                         std::multiplies<index_t>()); // B: Wei GKXC
+        group_stride_c = args.C_;                                     // C: In  NWGC
+
+        GemmBatch = args.G_;
+    }
+
+    template <
+        typename InLay                      = typename GroupedConvTraitsType_::InLayout,
+        typename WeiLay                     = typename GroupedConvTraitsType_::WeiLayout,
+        typename OutLay                     = typename GroupedConvTraitsType_::OutLayout,
+        typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NHWGC> &&
+                                    std::is_same_v<WeiLay, tensor_layout::convolution::GKYXC> &&
+                                    std::is_same_v<OutLay, tensor_layout::convolution::NHWGK>,
+                                bool>::type = false>
+    CK_TILE_HOST GroupedConvBwdDataKernelArgs(const GroupedConvBwdDataHostArgs& args)
+    { // 2D overload: enabled only for the NHWGC / GKYXC / NHWGK layout combination.
+        in_g_n_c_wis_lengths  = {static_cast<index_t>(args.G_),
+                                 static_cast<index_t>(args.N_),
+                                 static_cast<index_t>(args.C_),
+                                 static_cast<index_t>(args.input_spatial_lengths_[0]),
+                                 static_cast<index_t>(args.input_spatial_lengths_[1])};
+        wei_g_k_c_xs_lengths  = {static_cast<index_t>(args.G_),
+                                 static_cast<index_t>(args.K_),
+                                 static_cast<index_t>(args.C_),
+                                 static_cast<index_t>(args.filter_spatial_lengths_[0]),
+                                 static_cast<index_t>(args.filter_spatial_lengths_[1])};
+        out_g_n_k_wos_lengths = {static_cast<index_t>(args.G_),
+                                 static_cast<index_t>(args.N_),
+                                 static_cast<index_t>(args.K_),
+                                 static_cast<index_t>(args.output_spatial_lengths_[0]),
+                                 static_cast<index_t>(args.output_spatial_lengths_[1])};
+
+        conv_filter_strides   = {static_cast<index_t>(args.conv_filter_strides_[0]),
+                                 static_cast<index_t>(args.conv_filter_strides_[1])};
+        conv_filter_dilations = {static_cast<index_t>(args.conv_filter_dilations_[0]),
+                                 static_cast<index_t>(args.conv_filter_dilations_[1])};
+        input_left_pads       = {static_cast<index_t>(args.input_left_pads_[0]),
+                                 static_cast<index_t>(args.input_left_pads_[1])};
+        input_right_pads      = {static_cast<index_t>(args.input_right_pads_[0]),
+                                 static_cast<index_t>(args.input_right_pads_[1])};
+
+        k_batch = args.k_batch;
+
+        in_ptr  = args.in_ptr;
+        wei_ptr = args.wei_ptr;
+        for(index_t d = 0; d < NumDTensor; d++)
+        {
+            ds_ptr[d] = args.ds_ptr[d];
+        }
+        out_ptr = args.out_ptr;
+
+        const index_t Y               = wei_g_k_c_xs_lengths[3];
+        const index_t X               = wei_g_k_c_xs_lengths[4];
+        const index_t ConvStrideH     = conv_filter_strides[0];
+        const index_t ConvStrideW     = conv_filter_strides[1];
+        const index_t ConvDilationH   = conv_filter_dilations[0];
+        const index_t ConvDilationW   = conv_filter_dilations[1];
+        const auto GcdStrideDilationH = gcd(ConvStrideH, ConvDilationH);
+        const auto GcdStrideDilationW = gcd(ConvStrideW, ConvDilationW);
+        const auto YTilde             = ConvStrideH / GcdStrideDilationH;
+        const auto XTilde             = ConvStrideW / GcdStrideDilationW;
+
+        for(index_t i_ytilde = 0; i_ytilde < YTilde; ++i_ytilde)
+        {
+            for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde)
+            { // each (i_ytilde, i_xtilde) pair yields at most one GEMM
+                const auto YDotSlice = integer_divide_ceil(Y - i_ytilde, YTilde);
+                const auto XDotSlice = integer_divide_ceil(X - i_xtilde, XTilde);
+
+                if(XDotSlice * YDotSlice <= 0)
+                {
+                    continue; // slice size <= 0: register no GEMM
+                }
+
+                if(gemm_count >= MaxGroupedGemmGroupsNum)
+                {
+                    gemm_count++;
+                    // Descriptor arrays are full: keep counting, skip the writes.
+                    continue;
+                }
+
+                tildes = {i_ytilde, i_xtilde};
+
+                ConvToGemmTransformer conv_to_gemm_transformer{in_g_n_c_wis_lengths,
+                                                               wei_g_k_c_xs_lengths,
+                                                               out_g_n_k_wos_lengths,
+                                                               conv_filter_strides,
+                                                               conv_filter_dilations,
+                                                               input_left_pads,
+                                                               input_right_pads,
+                                                               tildes};
+
+                auto grid_descs = conv_to_gemm_transformer
+                                      .template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<
+                                          GroupedConvTraitsType_::NDimSpatial>(1);
+
+                a_grid_descs_m_k[gemm_count] = grid_descs.at(number<0>{});
+                b_grid_descs_n_k[gemm_count] = grid_descs.at(number<1>{});
+                c_grid_descs_m_n[gemm_count] = grid_descs.at(number<2>{});
+
+                const index_t grid_size_grp =
+                    TilePartitioner::GridSize(c_grid_descs_m_n[gemm_count].get_length(I0),
+                                              c_grid_descs_m_n[gemm_count].get_length(I1));
+
+                block_starts[gemm_count] = grid_size_;                 // sum of earlier sizes
+                block_ends[gemm_count]   = grid_size_ + grid_size_grp; // start + own size
+
+                grid_size_ += grid_size_grp; // running total over all registered GEMMs
+
+                ++gemm_count;
+            }
+        }
+        group_stride_a = args.K_; // A: Out NHWGK
+        group_stride_b = args.K_ * args.C_ *
+                         std::accumulate(args.filter_spatial_lengths_.begin(),
+                                         args.filter_spatial_lengths_.end(),
+                                         1,
+                                         std::multiplies<index_t>()); // B: Wei GKYXC
+        group_stride_c = args.C_;                                     // C: In  NHWGC
+
+        GemmBatch = args.G_;
+    }
+
+    template <
+        typename InLay                      = typename GroupedConvTraitsType_::InLayout,
+        typename WeiLay                     = typename GroupedConvTraitsType_::WeiLayout,
+        typename OutLay                     = typename GroupedConvTraitsType_::OutLayout,
+        typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NDHWGC> &&
+                                    std::is_same_v<WeiLay, tensor_layout::convolution::GKZYXC> &&
+                                    std::is_same_v<OutLay, tensor_layout::convolution::NDHWGK>,
+                                bool>::type = false>
+    CK_TILE_HOST GroupedConvBwdDataKernelArgs(const GroupedConvBwdDataHostArgs& args)
+    { // 3D overload: enabled only for the NDHWGC / GKZYXC / NDHWGK layout combination.
+        in_g_n_c_wis_lengths  = {static_cast<index_t>(args.G_),
+                                 static_cast<index_t>(args.N_),
+                                 static_cast<index_t>(args.C_),
+                                 static_cast<index_t>(args.input_spatial_lengths_[0]),
+                                 static_cast<index_t>(args.input_spatial_lengths_[1]),
+                                 static_cast<index_t>(args.input_spatial_lengths_[2])};
+        wei_g_k_c_xs_lengths  = {static_cast<index_t>(args.G_),
+                                 static_cast<index_t>(args.K_),
+                                 static_cast<index_t>(args.C_),
+                                 static_cast<index_t>(args.filter_spatial_lengths_[0]),
+                                 static_cast<index_t>(args.filter_spatial_lengths_[1]),
+                                 static_cast<index_t>(args.filter_spatial_lengths_[2])};
+        out_g_n_k_wos_lengths = {static_cast<index_t>(args.G_),
+                                 static_cast<index_t>(args.N_),
+                                 static_cast<index_t>(args.K_),
+                                 static_cast<index_t>(args.output_spatial_lengths_[0]),
+                                 static_cast<index_t>(args.output_spatial_lengths_[1]),
+                                 static_cast<index_t>(args.output_spatial_lengths_[2])};
+
+        conv_filter_strides   = {static_cast<index_t>(args.conv_filter_strides_[0]),
+                                 static_cast<index_t>(args.conv_filter_strides_[1]),
+                                 static_cast<index_t>(args.conv_filter_strides_[2])};
+        conv_filter_dilations = {static_cast<index_t>(args.conv_filter_dilations_[0]),
+                                 static_cast<index_t>(args.conv_filter_dilations_[1]),
+                                 static_cast<index_t>(args.conv_filter_dilations_[2])};
+        input_left_pads       = {static_cast<index_t>(args.input_left_pads_[0]),
+                                 static_cast<index_t>(args.input_left_pads_[1]),
+                                 static_cast<index_t>(args.input_left_pads_[2])};
+        input_right_pads      = {static_cast<index_t>(args.input_right_pads_[0]),
+                                 static_cast<index_t>(args.input_right_pads_[1]),
+                                 static_cast<index_t>(args.input_right_pads_[2])};
+
+        k_batch = args.k_batch;
+
+        in_ptr  = args.in_ptr;
+        wei_ptr = args.wei_ptr;
+        for(index_t d = 0; d < NumDTensor; d++)
+        {
+            ds_ptr[d] = args.ds_ptr[d];
+        }
+        out_ptr = args.out_ptr;
+
+        const index_t Z               = wei_g_k_c_xs_lengths[3];
+        const index_t Y               = wei_g_k_c_xs_lengths[4];
+        const index_t X               = wei_g_k_c_xs_lengths[5];
+        const index_t ConvStrideD     = conv_filter_strides[0];
+        const index_t ConvStrideH     = conv_filter_strides[1];
+        const index_t ConvStrideW     = conv_filter_strides[2];
+        const index_t ConvDilationD   = conv_filter_dilations[0];
+        const index_t ConvDilationH   = conv_filter_dilations[1];
+        const index_t ConvDilationW   = conv_filter_dilations[2];
+        const auto GcdStrideDilationD = gcd(ConvStrideD, ConvDilationD);
+        const auto GcdStrideDilationH = gcd(ConvStrideH, ConvDilationH);
+        const auto GcdStrideDilationW = gcd(ConvStrideW, ConvDilationW);
+        const auto ZTilde             = ConvStrideD / GcdStrideDilationD;
+        const auto YTilde             = ConvStrideH / GcdStrideDilationH;
+        const auto XTilde             = ConvStrideW / GcdStrideDilationW;
+
+        for(index_t i_ztilde = 0; i_ztilde < ZTilde; ++i_ztilde)
+        {
+            for(index_t i_ytilde = 0; i_ytilde < YTilde; ++i_ytilde)
+            {
+                for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde)
+                { // each (i_ztilde, i_ytilde, i_xtilde) triple yields at most one GEMM
+                    const auto ZDotSlice = integer_divide_ceil(Z - i_ztilde, ZTilde);
+                    const auto YDotSlice = integer_divide_ceil(Y - i_ytilde, YTilde);
+                    const auto XDotSlice = integer_divide_ceil(X - i_xtilde, XTilde);
+
+                    if(ZDotSlice * XDotSlice * YDotSlice <= 0)
+                    {
+                        continue; // slice size <= 0: register no GEMM
+                    }
+
+                    if(gemm_count >= MaxGroupedGemmGroupsNum)
+                    {
+                        gemm_count++;
+                        // Descriptor arrays are full: keep counting, skip the writes.
+                        continue;
+                    }
+
+                    tildes = {i_ztilde, i_ytilde, i_xtilde};
+
+                    ConvToGemmTransformer conv_to_gemm_transformer{in_g_n_c_wis_lengths,
+                                                                   wei_g_k_c_xs_lengths,
+                                                                   out_g_n_k_wos_lengths,
+                                                                   conv_filter_strides,
+                                                                   conv_filter_dilations,
+                                                                   input_left_pads,
+                                                                   input_right_pads,
+                                                                   tildes};
+
+                    auto grid_descs = conv_to_gemm_transformer
+                                          .template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<
+                                              GroupedConvTraitsType_::NDimSpatial>(1);
+
+                    a_grid_descs_m_k[gemm_count] = grid_descs.at(number<0>{});
+                    b_grid_descs_n_k[gemm_count] = grid_descs.at(number<1>{});
+                    c_grid_descs_m_n[gemm_count] = grid_descs.at(number<2>{});
+
+                    const index_t grid_size_grp =
+                        TilePartitioner::GridSize(c_grid_descs_m_n[gemm_count].get_length(I0),
+                                                  c_grid_descs_m_n[gemm_count].get_length(I1));
+
+                    block_starts[gemm_count] = grid_size_;                 // sum of earlier sizes
+                    block_ends[gemm_count]   = grid_size_ + grid_size_grp; // start + own size
+
+                    grid_size_ += grid_size_grp; // running total over all registered GEMMs
+
+                    ++gemm_count;
+                }
+            }
+        }
+
+        group_stride_a = args.K_; // A: Out NDHWGK
+        group_stride_b = args.K_ * args.C_ *
+                         std::accumulate(args.filter_spatial_lengths_.begin(),
+                                         args.filter_spatial_lengths_.end(),
+                                         1,
+                                         std::multiplies<index_t>()); // B: Wei GKZYXC
+        group_stride_c = args.C_;                                     // C: In  NDHWGC
+
+        GemmBatch = args.G_; // batch count comes from G_
+    }
+
+    static constexpr index_t MaxGroupedGemmGroupsNum = 128;
+
+    using ABCGridDescs =
+        remove_cvref_t<decltype(ConvToGemmTransformer{}
+                                    .template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(1))>;
+
+    using AGridDescMK = remove_cvref_t<decltype(ABCGridDescs{}[number<0>{}])>;
+    using BGridDescNK = remove_cvref_t<decltype(ABCGridDescs{}[number<1>{}])>;
+    using CGridDescMN = remove_cvref_t<decltype(ABCGridDescs{}[number<2>{}])>;
+
+    static constexpr index_t NonSpatialDims = 3;
+    array<index_t, NonSpatialDims + GroupedConvTraitsType_::NDimSpatial> in_g_n_c_wis_lengths;
+    array<index_t, NonSpatialDims + GroupedConvTraitsType_::NDimSpatial> wei_g_k_c_xs_lengths;
+    array<index_t, NonSpatialDims + GroupedConvTraitsType_::NDimSpatial> out_g_n_k_wos_lengths;
+
+    array<index_t, GroupedConvTraitsType_::NDimSpatial> conv_filter_strides;
+    array<index_t, GroupedConvTraitsType_::NDimSpatial> conv_filter_dilations;
+    array<index_t, GroupedConvTraitsType_::NDimSpatial> input_left_pads;
+    array<index_t, GroupedConvTraitsType_::NDimSpatial> input_right_pads;
+    array<index_t, GroupedConvTraitsType_::NDimSpatial> tildes;
+
+    index_t k_batch;
+    index_t GemmBatch;
+    index_t grid_size_ = 0;
+    index_t gemm_count = 0;
+
+    const void* out_ptr;
+    void* in_ptr;
+    std::array<const void*, NumDTensor> ds_ptr;
+    const void* wei_ptr;
+
+    array<AGridDescMK, MaxGroupedGemmGroupsNum> a_grid_descs_m_k;
+    array<BGridDescNK, MaxGroupedGemmGroupsNum> b_grid_descs_n_k;
+    array<CGridDescMN, MaxGroupedGemmGroupsNum> c_grid_descs_m_n;
+
+    array<index_t, MaxGroupedGemmGroupsNum> block_starts;
+    array<index_t, MaxGroupedGemmGroupsNum> block_ends;
+
+    long_index_t group_stride_a;
+    long_index_t group_stride_b;
+    long_index_t group_stride_c;
+};
+
+/// @brief The Grouped Convolution Backward Data kernel template.
+///
+/// @paragraph Overview Overview
+///            This class provides the grouped convolution backward data kernel template. By
+///            semantic division of Implicit GEMM algorithm into following parts we achieve
+///            flexible, versatile and robust kernel implementation.
+///
+///            @li @b Prolog - The start of GEMM kernel implementation in @ref operator()
+///                "function call operator" which determines the work scope of each workgroup.
+///            @li @b GemmPipeline - The core part @a "heart" of matrix multiplication algorithm.
+///                This is the place where each workgroup is loading data from global memory and
+///                carrying out dot products.
+///            @li @b Epilogue - The @a "final" part of matrix multiplication implementation
+///                 responsible for storing results to global memory. This is also the place where
+///                 any additional operator fusion may take place.
+///
+///            Additionally both @ref GemmPipeline_ "GemmPipeline" and @ref EpiloguePipeline_
+///            "EpiloguePipeline" are parameterized with so called @a Policy which determines all
+///            internal details of those functional parts. You can think of it like both gemm and
+///            epilogue pipelines provide the control-flow logic controlled by policies. Moreover
+///            the policy is responsible for definition of all necessary data layouts and thread's
+///            work distribution.
+///
+/// @tparam GroupedConvTraitsType_      The type of class providing traits for grouped convolution.
+/// @tparam TilePartitioner_            The type of class providing mapping of workgroup index
+///                                     into the output data tile to be calculated. It determines
+///                                     the workgroup to data relationship (or in other words -
+///                                     which data would be processed and calculated by which
+///                                     workgroup).
+/// @tparam GemmPipeline_               The type of class which provides the core part of matrix
+///                                     multiplication. This class should provide implementation of
+///                                     data loading from global memory and performing block-wise
+///                                     matrix multiplication. You can think of it as the work done
+///                                     from a single workgroup's point of view.
+/// @tparam EpiloguePipeline_           The type of class providing the final part of matrix
+///                                     multiplication implementation. It is responsible for storing
+///                                     results calculated by @ref GemmPipeline_ "GemmPipeline" to
+///                                     the output C tensor in global memory.
+template <typename GroupedConvTraitsType_,
+          typename TilePartitioner_,
+          typename GemmPipeline_,
+          typename EpiloguePipeline_>
+struct GroupedConvolutionBackwardDataKernel
+{
+    static constexpr index_t NDimSpatial = GroupedConvTraitsType_::NDimSpatial_;
+    static constexpr ConvolutionSpecialization ConvSpecialization =
+        GroupedConvTraitsType_::ConvSpecialization;
+    using TilePartitioner  = remove_cvref_t<TilePartitioner_>;
+    using GemmPipeline     = remove_cvref_t<GemmPipeline_>;
+    using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
+    using GemmALayout      = remove_cvref_t<typename GemmPipeline::ALayout>;
+    using GemmBLayout      = remove_cvref_t<typename GemmPipeline::BLayout>;
+    using GemmCLayout      = remove_cvref_t<typename GemmPipeline::CLayout>;
+    // Convolution tensor layouts, taken from the traits type.
+    using InLayout  = remove_cvref_t<typename GroupedConvTraitsType_::InLayout>;
+    using WeiLayout = remove_cvref_t<typename GroupedConvTraitsType_::WeiLayout>;
+    using OutLayout = remove_cvref_t<typename GroupedConvTraitsType_::OutLayout>;
+    using DsLayout  = remove_cvref_t<typename GroupedConvTraitsType_::DsLayout>;
+
+    using GemmDsLayout                  = remove_cvref_t<typename EpiloguePipeline::DsLayout>;
+    static constexpr index_t NumDTensor = GroupedConvTraitsType_::NumDTensor;
+
+    static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
+    // NOTE(review): InDataType is the GEMM A type, yet operator() reads A from out_ptr - confirm.
+    using InDataType  = remove_cvref_t<typename GemmPipeline::ADataType>;
+    using WeiDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
+    using DsDataType  = remove_cvref_t<typename EpiloguePipeline::DsDataType>;
+
+    using OutDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+
+    using GroupedConvBwdDataKernelArgsSpecialized =
+        GroupedConvBwdDataKernelArgs<GroupedConvTraitsType_, TilePartitioner>;
+    static constexpr index_t MaxGroupedGemmGroupsNum =
+        GroupedConvBwdDataKernelArgsSpecialized::MaxGroupedGemmGroupsNum;
+
+    // TODO: Enable this. While false, IsSupportedArgument() rejects every k_batch other than 1.
+    static constexpr bool IsSplitKSupported = false;
+    // Compile-time indices used to address tuple elements and descriptor dimensions.
+    static constexpr auto I0 = number<0>();
+    static constexpr auto I1 = number<1>();
+    static constexpr auto I2 = number<2>();
+    static constexpr auto I3 = number<3>();
+    // Compile-time restrictions: padded M/N/K, row-major A and C, column-major B GEMM layouts.
+    static_assert(GemmPipeline::kPadM && GemmPipeline::kPadN && GemmPipeline::kPadK,
+                  "Not supported!");
+    static_assert(std::is_same_v<GemmALayout, tensor_layout::gemm::RowMajor>,
+                  "Not supported A GEMM layout!");
+    static_assert(std::is_same_v<GemmBLayout, tensor_layout::gemm::ColumnMajor>,
+                  "Not supported B GEMM layout!");
+    static_assert(std::is_same_v<GemmCLayout, tensor_layout::gemm::RowMajor>,
+                  "Not supported C GEMM layout!");
+    /// @brief Kernel name: concat() of '_', op tag, precision string and pipeline name.
+    [[nodiscard]] CK_TILE_HOST static const std::string GetName()
+    {
+        // clang-format off
+        return concat('_', "grouped_convolution_backward_data", gemm_prec_str<InDataType, WeiDataType>, GemmPipeline::GetName());
+        // clang-format on
+    }
+
+    /// @brief Launch grid: x = grid_size_, y = GemmBatch (batched grouped gemm), z = k_batch.
+    CK_TILE_HOST static auto GridSize(const GroupedConvBwdDataKernelArgsSpecialized& kargs)
+    {
+        return dim3(kargs.grid_size_, kargs.GemmBatch, kargs.k_batch);
+    }
+    /// @brief Workgroup dimensions: dim3 built from KernelBlockSize (GemmPipeline::BlockSize).
+    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
+    /// @brief Builds the kernel argument struct by passing @p hostArgs to its constructor.
+    CK_TILE_HOST static constexpr GroupedConvBwdDataKernelArgsSpecialized
+    MakeKernelArgs(const GroupedConvBwdDataHostArgs& hostArgs)
+    {
+        return GroupedConvBwdDataKernelArgsSpecialized(hostArgs);
+    }
+    /// @brief Size of one LDS buffer: the larger of the GEMM and epilogue pipeline requirements.
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+    }
+
+    /// @brief Host-side check whether this kernel instance can run the given arguments.
+    ///
+    /// @param kargs Kernel arguments to validate.
+    /// @return false if k_batch != 1 is requested while split-K is unavailable, if gemm_count
+    ///         exceeds MaxGroupedGemmGroupsNum, if the filter/stride/padding contradict
+    ///         ConvSpecialization, if a tensor layout is unsupported, or if ConvC / ConvK is not a
+    ///         multiple of the operand vector size it is checked against; true otherwise.
+    CK_TILE_HOST static bool
+    IsSupportedArgument(const GroupedConvBwdDataKernelArgsSpecialized& kargs)
+    {
+        if constexpr((EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+                      is_any_of<OutDataType, fp16_t, bf16_t>::value) ||
+                     !IsSplitKSupported)
+        {
+            if(kargs.k_batch != 1)
+            {
+                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+                {
+                    CK_TILE_ERROR("Conditions not met for Kbatch >1 !");
+                }
+                return false;
+            }
+        }
+
+        // The kernel arguments store at most MaxGroupedGemmGroupsNum per-GEMM descriptors.
+        if(kargs.gemm_count > MaxGroupedGemmGroupsNum)
+        {
+            return false;
+        }
+
+        const index_t ConvK = kargs.wei_g_k_c_xs_lengths[number<1>{}];
+        const index_t ConvC = kargs.wei_g_k_c_xs_lengths[number<2>{}];
+
+        // check ConvSpecialization
+        if constexpr(ConvSpecialization == ConvolutionSpecialization::Filter1x1Stride1Pad0 ||
+                     ConvSpecialization == ConvolutionSpecialization::Filter1x1Pad0)
+        {
+            // 1x1 filter and zero padding for both; unit stride only for Filter1x1Stride1Pad0.
+            constexpr bool RequireUnitStride =
+                ConvSpecialization == ConvolutionSpecialization::Filter1x1Stride1Pad0;
+            for(index_t i = 0; i < NDimSpatial; ++i)
+            {
+                const index_t SpatialDim = kargs.wei_g_k_c_xs_lengths[i + 3];
+                const index_t ConvStride = kargs.conv_filter_strides[i];
+                const index_t LeftPad    = kargs.input_left_pads[i];
+                const index_t RightPad   = kargs.input_right_pads[i];
+
+                if(!(SpatialDim == 1 && LeftPad == 0 && RightPad == 0))
+                {
+                    return false;
+                }
+                if(RequireUnitStride && ConvStride != 1)
+                {
+                    return false;
+                }
+            }
+        }
+        else if constexpr(ConvSpecialization == ConvolutionSpecialization::Filter3x3)
+        {
+            if(ConvC != 1)
+            {
+                return false;
+            }
+            for(index_t i = 0; i < NDimSpatial; ++i)
+            {
+                const index_t filter_spatial_dim = kargs.wei_g_k_c_xs_lengths[i + I3];
+
+                if(filter_spatial_dim != I3)
+                {
+                    return false;
+                }
+            }
+        }
+
+        namespace ctc = tensor_layout::convolution;
+
+        // GEMM C is the input image (see operator()); it is checked against the epilogue's C
+        // vector size.
+        if constexpr(std::is_same_v<InLayout, ctc::NWGC> || std::is_same_v<InLayout, ctc::NHWGC> ||
+                     std::is_same_v<InLayout, ctc::NDHWGC>)
+        {
+            if(ConvC % EpiloguePipeline::GetVectorSizeC() != 0)
+            {
+                CK_TILE_ERROR("Conv C is not a multiple of vector store size for input image!");
+                return false;
+            }
+        }
+        else
+        {
+            CK_TILE_ERROR("Not supported input layout!");
+            return false;
+        }
+
+        // GEMM B is the weight; it is checked against the GEMM pipeline's B vector size.
+        if constexpr(std::is_same_v<WeiLayout, ctc::GKXC> ||
+                     std::is_same_v<WeiLayout, ctc::GKYXC> ||
+                     std::is_same_v<WeiLayout, ctc::GKZYXC>)
+        {
+            if(ConvC % GemmPipeline::GetVectorSizeB() != 0)
+            {
+                CK_TILE_ERROR("Conv C is not a multiple of vector load size for weight!");
+                return false;
+            }
+        }
+        else
+        {
+            CK_TILE_ERROR("Not supported weight layout!");
+            return false;
+        }
+
+        // GEMM A is the output image; it is checked against the GEMM pipeline's A vector size.
+        if constexpr(std::is_same_v<OutLayout, ctc::NWGK> ||
+                     std::is_same_v<OutLayout, ctc::NHWGK> ||
+                     std::is_same_v<OutLayout, ctc::NDHWGK>)
+        {
+            if(ConvK % GemmPipeline::GetVectorSizeA() != 0)
+            {
+                CK_TILE_ERROR("Conv K is not a multiple of vector load size for output image!");
+                return false;
+            }
+        }
+        else
+        {
+            CK_TILE_ERROR("Not supported output layout!");
+            return false;
+        }
+
+        return true;
+    }
+
+    /// @brief Wraps the raw global-memory pointers of one GEMM into tensor views.
+    /// @tparam DstInMemOp Supplied by the callers; not referenced in this body.
+    /// @param a_ptr,b_ptr,c_ptr GEMM A (conv output), B (weight) and C (conv input) data.
+    /// @param ds_ptr Pointers to the NumDTensor D tensors, viewed through the C descriptor.
+    /// @param kargs Kernel arguments holding the per-GEMM grid descriptors.
+    /// @param group_id Index of the GEMM whose descriptors are used.
+    /// @return Tuple (A view, B view, tuple of D views, C view).
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    CK_TILE_DEVICE static auto
+    MakeGemmTensorViews(const OutDataType* a_ptr,
+                        const InDataType* b_ptr,
+                        const std::array<const void*, NumDTensor>& ds_ptr,
+                        WeiDataType* c_ptr,
+                        const GroupedConvBwdDataKernelArgsSpecialized& kargs,
+                        const index_t group_id)
+    {
+        static_assert(!TilePartitioner::BlockGemmShape::PermuteA, "Not implemented!");
+        static_assert(!TilePartitioner::BlockGemmShape::PermuteB, "Not implemented!");
+
+        const auto a_tensor_view = make_tensor_view<address_space_enum::global>(
+            a_ptr, kargs.a_grid_descs_m_k[group_id]); // A: out
+        const auto b_tensor_view = make_tensor_view<address_space_enum::global>(
+            b_ptr, kargs.b_grid_descs_n_k[group_id]); // B: weight
+        const auto c_tensor_view = make_tensor_view<address_space_enum::global>(
+            c_ptr, kargs.c_grid_descs_m_n[group_id]); // C: in
+
+        // Every D tensor reuses the C descriptor, hence the layout and type asserts below.
+        const auto ds_tensor_view = generate_tuple(
+            [&](auto i) {
+                static_assert(std::is_same_v<std::tuple_element_t<i, DsLayout>, OutLayout>,
+                              "Not supported!");
+                static_assert(std::is_same_v<GemmCLayout, tensor_layout::gemm::RowMajor>,
+                              "Not supported!");
+                static_assert(std::is_same_v<std::tuple_element_t<i, DsDataType>, OutDataType>,
+                              "Not supported!");
+
+                // ds_ptr holds pointers-to-const; static_cast must keep the qualifier to compile.
+                return make_tensor_view<address_space_enum::global>(
+                    static_cast<const OutDataType*>(ds_ptr[i]), kargs.c_grid_descs_m_n[group_id]);
+            },
+            number<NumDTensor>{});
+
+        return make_tuple(a_tensor_view, b_tensor_view, ds_tensor_view, c_tensor_view);
+    }
+
+    /// @brief Applies pad_tensor_view to every tensor view of one GEMM.
+    ///
+    /// A uses the tile lengths (MPerBlock, KPerBlock), B uses (NPerBlock, KPerBlock), every D and
+    /// C use (MPerBlock, NPerBlock). Padding is requested for both dimensions of each view.
+    ///
+    /// @tparam TensorView Tuple type with the A, B, Ds and C views at positions I0, I1, I2, I3.
+    /// @param views The views to pad.
+    /// @return Tuple (A, B, Ds, C) of padded views.
+    template <typename TensorView>
+    CK_TILE_DEVICE static auto MakeGemmPadViews(const TensorView& views)
+    {
+        // Tile lengths handed to pad_tensor_view, named after the GEMM dimensions they cover.
+        const auto tile_m_k =
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::KPerBlock>{});
+
+        const auto tile_n_k =
+            make_tuple(number<TilePartitioner::NPerBlock>{}, number<TilePartitioner::KPerBlock>{});
+
+        const auto tile_m_n =
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{});
+
+        // Request padding on both dimensions everywhere.
+        const auto pad_both_dims = sequence<true, true>{};
+
+        // GEMM inputs.
+        const auto padded_a = pad_tensor_view(views.at(I0), tile_m_k, pad_both_dims);
+        const auto padded_b = pad_tensor_view(views.at(I1), tile_n_k, pad_both_dims);
+
+        // The D views use the same (M, N) tile lengths as C.
+        const auto& d_views  = views.at(I2);
+        const auto padded_ds = generate_tuple(
+            [&](auto i) { return pad_tensor_view(d_views[i], tile_m_n, pad_both_dims); },
+            number<NumDTensor>{});
+
+        // GEMM output.
+        const auto padded_c = pad_tensor_view(views.at(I3), tile_m_n, pad_both_dims);
+
+        return make_tuple(padded_a, padded_b, padded_ds, padded_c);
+    }
+
+    /// @brief Creates the block tile windows over the padded views of one GEMM.
+    ///
+    /// A and B windows start at (i_m, i_k) and (i_n, i_k); the D and C windows start at (i_m, i_n).
+    ///
+    /// @tparam PadView Tuple type with the padded A, B, Ds and C views at positions I0..I3.
+    /// @param views The padded views.
+    /// @param i_m   Window origin along M.
+    /// @param i_n   Window origin along N.
+    /// @param i_k   Window origin along K. Defaults to 0.
+    /// @return Tuple (A window, B window, tuple of D windows, C window).
+    template <typename PadView>
+    CK_TILE_DEVICE static auto MakeGemmTileWindows(const PadView& views,
+                                                   const index_t i_m,
+                                                   const index_t i_n,
+                                                   const index_t i_k = 0)
+    {
+        // Window lengths: (M, K) for A, (N, K) for B and (M, N) for the Ds and C.
+        const auto window_m_k =
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::KPerBlock>{});
+
+        const auto window_n_k =
+            make_tuple(number<TilePartitioner::NPerBlock>{}, number<TilePartitioner::KPerBlock>{});
+
+        const auto window_m_n =
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{});
+
+        // GEMM input windows.
+        const auto a_window = make_tile_window(views.at(I0), window_m_k, {i_m, i_k});
+        const auto b_window = make_tile_window(views.at(I1), window_n_k, {i_n, i_k});
+
+        // One window per D tensor, placed like the C window.
+        const auto& padded_ds = views.at(I2);
+        const auto ds_windows = generate_tuple(
+            [&](auto i) { return make_tile_window(padded_ds[i], window_m_n, {i_m, i_n}); },
+            number<NumDTensor>{});
+
+        // GEMM output window.
+        auto c_window = make_tile_window(views.at(I3), window_m_n, {i_m, i_n});
+
+        return make_tuple(a_window, b_window, ds_windows, c_window);
+    }
+
+    /**
+     * @brief Runs single GEMM problem cooperatively by whole workgroup.
+     * @param a_ptr input A pointer
+     * @param b_ptr input B; NOTE(review): typed InDataType* but operator() passes WeiDataType*.
+     * @param ds_ptr pointers to the NumDTensor D tensors; their windows go to the epilogue
+     * @param c_ptr output C; NOTE(review): typed WeiDataType* but operator() passes InDataType*.
+     * @param smem_ptr_0 The start memory pointer of the shared memory block.
+     * @param kargs Grouped Convolution Backward Data kernel arguments
+     * @param block_idx_m Tile origin along M (operator() passes iM * MPerBlock).
+     * @param block_idx_n Tile origin along N (operator() passes iN * NPerBlock).
+     * @param group_id Index of the GEMM whose grid descriptors are used.
+     */
+    CK_TILE_DEVICE static void RunGemm(const OutDataType* a_ptr,
+                                       const InDataType* b_ptr,
+                                       const std::array<const void*, NumDTensor>& ds_ptr,
+                                       WeiDataType* c_ptr,
+                                       void* smem_ptr_0,
+                                       const GroupedConvBwdDataKernelArgsSpecialized& kargs,
+                                       const index_t block_idx_m,
+                                       const index_t block_idx_n,
+                                       const index_t group_id)
+    {
+        // Create Gemm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple =
+            MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(
+                a_ptr, b_ptr, ds_ptr, c_ptr, kargs, group_id);
+
+        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+        // Loop count for the GEMM pipeline, from length I1 (K) of the padded A view.
+        const index_t num_loop = __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(
+            gemm_pad_views.at(I0).get_tensor_descriptor().get_length(I1)));
+
+        // Run GEMM cooperatively by whole workgroup.
+        const auto& a_block_window = gemm_tile_windows.at(I0);
+        const auto& b_block_window = gemm_tile_windows.at(I1);
+        const auto& d_block_window = gemm_tile_windows.at(I2);
+
+        const auto& c_block_tile = GemmPipeline{}.template operator()(
+            a_block_window, b_block_window, num_loop, smem_ptr_0);
+
+        // Run Epilogue Pipeline; it is handed the same shared memory block as the GEMM pipeline.
+        auto& c_block_window = gemm_tile_windows.at(I3);
+
+        EpiloguePipeline{}.template operator()<decltype(c_block_window), decltype(c_block_tile)>(
+            c_block_window, c_block_tile, d_block_window, smem_ptr_0);
+    }
+
+    /**
+     * @brief Runs single GEMM problem cooperatively by whole workgroup.
+     * @note Variant of RunGemm for GEMM pipelines that take two shared memory buffers (ping-pong
+     *       buffering). The epilogue is only handed the first buffer.
+     *
+     * @param a_ptr input A pointer
+     * @param b_ptr input B pointer
+     * @param ds_ptr pointers to the NumDTensor D tensors; their windows go to the epilogue
+     * @param c_ptr output C pointer
+     * @param smem_ptr_0 The starting pointer of 1st shared memory block.
+     * @param smem_ptr_1 The starting pointer of 2nd shared memory block.
+     * @param kargs Grouped Convolution Backward Data kernel arguments
+     * @param block_idx_m Tile origin along M, forwarded to MakeGemmTileWindows.
+     * @param block_idx_n Tile origin along N, forwarded to MakeGemmTileWindows.
+     */
+    CK_TILE_DEVICE static void RunGemm2LDS(const OutDataType* a_ptr,
+                                           const InDataType* b_ptr,
+                                           const std::array<const void*, NumDTensor>& ds_ptr,
+                                           WeiDataType* c_ptr,
+                                           void* __restrict__ smem_ptr_0,
+                                           void* __restrict__ smem_ptr_1,
+                                           const GroupedConvBwdDataKernelArgsSpecialized& kargs,
+                                           const index_t block_idx_m,
+                                           const index_t block_idx_n,
+                                           const index_t group_id) // GEMM whose descriptors are used
+    {
+        // Create Gemm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple =
+            MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(
+                a_ptr, b_ptr, ds_ptr, c_ptr, kargs, group_id);
+        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+        // Same loop count as RunGemm: taken from length I1 (K) of the padded A view.
+        const index_t num_loop = __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(
+            gemm_pad_views.at(I0).get_tensor_descriptor().get_length(I1)));
+
+        // Run GEMM cooperatively by whole workgroup.
+        const auto& a_block_window = gemm_tile_windows.at(I0);
+        const auto& b_block_window = gemm_tile_windows.at(I1);
+        const auto& d_block_window = gemm_tile_windows.at(I2);
+
+        const auto& c_block_tile = GemmPipeline{}.template operator()(
+            a_block_window, b_block_window, num_loop, smem_ptr_0, smem_ptr_1);
+
+        // Run Epilogue Pipeline
+        auto& c_block_window = gemm_tile_windows.at(I3);
+
+        EpiloguePipeline{}.template operator()<decltype(c_block_window), decltype(c_block_tile)>(
+            c_block_window, c_block_tile, d_block_window, smem_ptr_0);
+    }
+
+    /// @brief Binary search for the GEMM whose [block_starts, block_ends) range holds @p block_id.
+    /// @note Terminates for any block_id; when no range matches, the last probed group (0 if
+    ///       gemm_count is 0) is returned. Relies on the ranges being ordered by group index.
+    CK_TILE_DEVICE index_t FindGroupId(const GroupedConvBwdDataKernelArgsSpecialized& kargs,
+                                       index_t block_id) const
+    {
+        index_t left     = 0;
+        index_t right    = kargs.gemm_count; // exclusive upper bound
+        index_t group_id = 0;
+
+        while(left < right)
+        {
+            group_id = index_t((left + right) >> 1);
+            if(block_id >= kargs.block_starts[group_id] && block_id < kargs.block_ends[group_id])
+            {
+                break;
+            }
+            const bool go_left = block_id < kargs.block_starts[group_id];
+            right              = go_left ? group_id : right;
+            left               = go_left ? left : group_id + 1;
+        }
+
+        return group_id;
+    }
+
+    /// @brief Kernel entry point; each workgroup computes one C tile of one GEMM.
+    /// @note blockIdx.x selects the GEMM (see FindGroupId) and the tile inside it, blockIdx.y
+    ///       scales the group strides; blockIdx.z is not read.
+    CK_TILE_DEVICE void operator()(GroupedConvBwdDataKernelArgsSpecialized kargs) const
+    {
+        const auto blockIdX    = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const index_t group_id = FindGroupId(kargs, blockIdX);
+
+        const auto [iM, iN] = OffsettedTile1DPartitioner<TilePartitioner>::GetOffsetedTileIndex(
+            kargs.block_starts[group_id],
+            kargs.c_grid_descs_m_n[group_id].get_length(I0),
+            kargs.c_grid_descs_m_n[group_id].get_length(I1));
+
+        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+
+        // group_stride_* are long_index_t but __builtin_amdgcn_readfirstlane takes an int, so the
+        // products are kept in 64 bits rather than passed through it; both factors are uniform.
+        const auto blockIdY               = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const long_index_t group_offset_a = kargs.group_stride_a * blockIdY;
+        const long_index_t group_offset_b = kargs.group_stride_b * blockIdY;
+        const long_index_t group_offset_c = kargs.group_stride_c * blockIdY;
+
+        // conv_bwd_data = Out * Weight = In
+        const OutDataType* a_ptr = static_cast<const OutDataType*>(kargs.out_ptr) + group_offset_a;
+        const WeiDataType* b_ptr = static_cast<const WeiDataType*>(kargs.wei_ptr) + group_offset_b;
+        InDataType* c_ptr        = static_cast<InDataType*>(kargs.in_ptr) + group_offset_c;
+
+        // Instances pairing atomic_add with an odd C vector size on fp16/bf16 run no GEMM.
+        constexpr bool RunsGemm =
+            !(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
+              EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+              is_any_of<OutDataType, fp16_t, bf16_t>::value);
+
+        // allocate LDS
+        __shared__ char smem_ptr_0[GetSmemSize()];
+
+        if constexpr(RunsGemm && GemmPipeline::DoubleSmemBuffer == true)
+        {
+            __shared__ char smem_ptr_1[GetSmemSize()];
+            RunGemm2LDS(a_ptr,
+                        b_ptr,
+                        kargs.ds_ptr,
+                        c_ptr,
+                        smem_ptr_0,
+                        smem_ptr_1,
+                        kargs,
+                        i_m,
+                        i_n,
+                        group_id);
+        }
+        else if constexpr(RunsGemm)
+        {
+            RunGemm(a_ptr, b_ptr, kargs.ds_ptr, c_ptr, smem_ptr_0, kargs, i_m, i_n, group_id);
+        }
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
index 115f6dea19..2700353049 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
@@ -17,19 +17,19 @@
 namespace ck_tile {
 
 /// @brief The Grouped Convolution kernel device arguments.
-template <typename GroupedConvTraitsType>
+template <typename GroupedConvTraitsType_>
 struct GroupedConvBwdWeightKernelArgs
 {
 
     using ConvToGemmTransformer =
-        TransformConvBwdWeightToGemm<GroupedConvTraitsType::NDimSpatial,
-                                     GroupedConvTraitsType::ConvSpecialization>;
-    static constexpr index_t NumDTensor = GroupedConvTraitsType::NumDTensor;
+        TransformConvBwdWeightToGemm<GroupedConvTraitsType_::NDimSpatial,
+                                     GroupedConvTraitsType_::ConvSpecialization>;
+    static constexpr index_t NumDTensor = GroupedConvTraitsType_::NumDTensor;
 
     template <
-        typename InLay                      = typename GroupedConvTraitsType::InLayout,
-        typename WeiLay                     = typename GroupedConvTraitsType::WeiLayout,
-        typename OutLay                     = typename GroupedConvTraitsType::OutLayout,
+        typename InLay                      = typename GroupedConvTraitsType_::InLayout,
+        typename WeiLay                     = typename GroupedConvTraitsType_::WeiLayout,
+        typename OutLay                     = typename GroupedConvTraitsType_::OutLayout,
         typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NWGC> &&
                                     std::is_same_v<WeiLay, tensor_layout::convolution::GKXC> &&
                                     std::is_same_v<OutLay, tensor_layout::convolution::NWGK>,
@@ -75,7 +75,7 @@ struct GroupedConvBwdWeightKernelArgs
         // tuple
         auto grid_descs =
             conv_to_gemm_transformer.template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<
-                GroupedConvTraitsType::NDimSpatial>();
+                GroupedConvTraitsType_::NDimSpatial>();
 
         a_grid_desc_m_k = grid_descs.at(number<0>{});
         b_grid_desc_n_k = grid_descs.at(number<1>{});
@@ -96,9 +96,9 @@ struct GroupedConvBwdWeightKernelArgs
     }
 
     template <
-        typename InLay                      = typename GroupedConvTraitsType::InLayout,
-        typename WeiLay                     = typename GroupedConvTraitsType::WeiLayout,
-        typename OutLay                     = typename GroupedConvTraitsType::OutLayout,
+        typename InLay                      = typename GroupedConvTraitsType_::InLayout,
+        typename WeiLay                     = typename GroupedConvTraitsType_::WeiLayout,
+        typename OutLay                     = typename GroupedConvTraitsType_::OutLayout,
         typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NHWGC> &&
                                     std::is_same_v<WeiLay, tensor_layout::convolution::GKYXC> &&
                                     std::is_same_v<OutLay, tensor_layout::convolution::NHWGK>,
@@ -151,7 +151,7 @@ struct GroupedConvBwdWeightKernelArgs
         // tuple
         auto grid_descs =
             conv_to_gemm_transformer.template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<
-                GroupedConvTraitsType::NDimSpatial>();
+                GroupedConvTraitsType_::NDimSpatial>();
 
         a_grid_desc_m_k = grid_descs.at(number<0>{});
         b_grid_desc_n_k = grid_descs.at(number<1>{});
@@ -172,9 +172,9 @@ struct GroupedConvBwdWeightKernelArgs
     }
 
     template <
-        typename InLay                      = typename GroupedConvTraitsType::InLayout,
-        typename WeiLay                     = typename GroupedConvTraitsType::WeiLayout,
-        typename OutLay                     = typename GroupedConvTraitsType::OutLayout,
+        typename InLay                      = typename GroupedConvTraitsType_::InLayout,
+        typename WeiLay                     = typename GroupedConvTraitsType_::WeiLayout,
+        typename OutLay                     = typename GroupedConvTraitsType_::OutLayout,
         typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NDHWGC> &&
                                     std::is_same_v<WeiLay, tensor_layout::convolution::GKZYXC> &&
                                     std::is_same_v<OutLay, tensor_layout::convolution::NDHWGK>,
@@ -234,7 +234,7 @@ struct GroupedConvBwdWeightKernelArgs
         // tuple
         auto grid_descs =
             conv_to_gemm_transformer.template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<
-                GroupedConvTraitsType::NDimSpatial>();
+                GroupedConvTraitsType_::NDimSpatial>();
 
         a_grid_desc_m_k = grid_descs.at(number<0>{});
         b_grid_desc_n_k = grid_descs.at(number<1>{});
@@ -263,14 +263,14 @@ struct GroupedConvBwdWeightKernelArgs
     using CGridDescMN = remove_cvref_t<decltype(ABCGridDescs{}[number<2>{}])>;
 
     static constexpr index_t NonSpatialDims = 3;
-    array<index_t, NonSpatialDims + GroupedConvTraitsType::NDimSpatial> in_g_n_c_wis_lengths;
-    array<index_t, NonSpatialDims + GroupedConvTraitsType::NDimSpatial> wei_g_k_c_xs_lengths;
-    array<index_t, NonSpatialDims + GroupedConvTraitsType::NDimSpatial> out_g_n_k_wos_lengths;
+    array<index_t, NonSpatialDims + GroupedConvTraitsType_::NDimSpatial> in_g_n_c_wis_lengths;
+    array<index_t, NonSpatialDims + GroupedConvTraitsType_::NDimSpatial> wei_g_k_c_xs_lengths;
+    array<index_t, NonSpatialDims + GroupedConvTraitsType_::NDimSpatial> out_g_n_k_wos_lengths;
 
-    array<index_t, GroupedConvTraitsType::NDimSpatial> conv_filter_strides;
-    array<index_t, GroupedConvTraitsType::NDimSpatial> conv_filter_dilations;
-    array<index_t, GroupedConvTraitsType::NDimSpatial> input_left_pads;
-    array<index_t, GroupedConvTraitsType::NDimSpatial> input_right_pads;
+    array<index_t, GroupedConvTraitsType_::NDimSpatial> conv_filter_strides;
+    array<index_t, GroupedConvTraitsType_::NDimSpatial> conv_filter_dilations;
+    array<index_t, GroupedConvTraitsType_::NDimSpatial> input_left_pads;
+    array<index_t, GroupedConvTraitsType_::NDimSpatial> input_right_pads;
 
     index_t k_batch;
     index_t GemmM;
@@ -292,12 +292,12 @@ struct GroupedConvBwdWeightKernelArgs
     long_index_t group_stride_c;
 };
 
-/// @brief The Grouped Convolution Forward kernel template.
+/// @brief The Grouped Convolution Backward Weight kernel template.
 ///
 /// @paragraph Overview Overview
-///            This class provides the grouped convolution forward kernel template. By semantic
-///            division of Implicit GEMM algorithm into following parts we achieve flexible,
-///            versatile and robust kernel implementation.
+///            This class provides the grouped convolution backward weight kernel template. By
+///            semantic division of Implicit GEMM algorithm into following parts we achieve
+///            flexible, versatile and robust kernel implementation.
 ///
 ///            @li @b Prolog - The start of GEMM kernel implementation in @ref operator()
 ///                function call operator" which determines the work scope of each workgroup.
@@ -315,7 +315,7 @@ struct GroupedConvBwdWeightKernelArgs
 ///            the policy is responsible for definition of all necessary data layouts and thread's
 ///            work distribution.
 ///
-/// tparam ConvSpecialization  Tensor descriptors specialization.
+/// @tparam GroupedConvTraitsType_       The type of class providing traits for grouped convolution.
 /// @tparam TilePartitioner_            The type of class providing mapping of workgroup index into
 /// the
 ///                                     output data tile to be calculated. It determines the
@@ -330,15 +330,15 @@ struct GroupedConvBwdWeightKernelArgs
 ///                                     multiplication implementation. It is responsible for storing
 ///                                     results calculated by @ref GemmPipeline_ "GemmPipeline" to
 ///                                     the output C tensor in global memory.
-template <typename GroupedConvTraitsType,
+template <typename GroupedConvTraitsType_,
           typename TilePartitioner_,
           typename GemmPipeline_,
           typename EpiloguePipeline_>
 struct GroupedConvolutionBackwardWeightKernel
 {
-    static constexpr index_t NDimSpatial = GroupedConvTraitsType::NDimSpatial_;
+    static constexpr index_t NDimSpatial = GroupedConvTraitsType_::NDimSpatial_;
     static constexpr ConvolutionSpecialization ConvSpecialization =
-        GroupedConvTraitsType::ConvSpecialization;
+        GroupedConvTraitsType_::ConvSpecialization;
     using TilePartitioner  = remove_cvref_t<TilePartitioner_>;
     using GemmPipeline     = remove_cvref_t<GemmPipeline_>;
     using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
@@ -346,15 +346,15 @@ struct GroupedConvolutionBackwardWeightKernel
     using GemmBLayout      = remove_cvref_t<typename GemmPipeline::BLayout>;
     using GemmCLayout      = remove_cvref_t<typename GemmPipeline::CLayout>;
 
-    using InLayout  = remove_cvref_t<typename GroupedConvTraitsType::InLayout>;
-    using WeiLayout = remove_cvref_t<typename GroupedConvTraitsType::WeiLayout>;
-    using OutLayout = remove_cvref_t<typename GroupedConvTraitsType::OutLayout>;
-    using DsLayout  = remove_cvref_t<typename GroupedConvTraitsType::DsLayout>;
+    using InLayout  = remove_cvref_t<typename GroupedConvTraitsType_::InLayout>;
+    using WeiLayout = remove_cvref_t<typename GroupedConvTraitsType_::WeiLayout>;
+    using OutLayout = remove_cvref_t<typename GroupedConvTraitsType_::OutLayout>;
+    using DsLayout  = remove_cvref_t<typename GroupedConvTraitsType_::DsLayout>;
 
     using GemmDsLayout                  = remove_cvref_t<typename EpiloguePipeline::DsLayout>;
-    static constexpr index_t NumDTensor = GroupedConvTraitsType::NumDTensor;
+    static constexpr index_t NumDTensor = GroupedConvTraitsType_::NumDTensor;
 
-    static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
+    static constexpr index_t kBlockSize = GemmPipeline::BlockSize;
 
     using InDataType  = remove_cvref_t<typename GemmPipeline::ADataType>;
     using WeiDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
@@ -363,7 +363,7 @@ struct GroupedConvolutionBackwardWeightKernel
     using OutDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
 
     using GroupedConvBwdWeightKernelArgsSpecialized =
-        GroupedConvBwdWeightKernelArgs<GroupedConvTraitsType>;
+        GroupedConvBwdWeightKernelArgs<GroupedConvTraitsType_>;
 
     // TODO: Enable this
     static constexpr bool IsSplitKSupported = true;
@@ -393,7 +393,7 @@ struct GroupedConvolutionBackwardWeightKernel
             TilePartitioner::GridSize(kargs.GemmM, kargs.GemmN), kargs.GemmBatch, kargs.k_batch);
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
+    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
 
     CK_TILE_HOST static constexpr GroupedConvBwdWeightKernelArgsSpecialized
     MakeKernelArgs(const GroupedConvBwdWeightHostArgs& hostArgs)
@@ -594,12 +594,9 @@ struct GroupedConvolutionBackwardWeightKernel
         }();
 
         const auto& c_tensor_view = [&]() {
-            return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
+            return make_tensor_view<address_space_enum::global, DstInMemOp>(
                 c_ptr,
-                make_tuple(kargs.GemmM, kargs.GemmN),
-                make_tuple(kargs.GemmN, 1),
-                number<EpiloguePipeline::GetVectorSizeC()>{},
-                number<1>{});
+                kargs.c_grid_desc_m_n); // C: wei
         }();
 
         const auto& ds_tensor_view = generate_tuple(
@@ -708,7 +705,7 @@ struct GroupedConvolutionBackwardWeightKernel
      * @param b_ptr input B pointer
      * @param c_ptr output C pointer
      * @param smem_ptr_0 The start memory pointer of the shared memory block.
-     * @param kargs Grouped Convolution Forward kernel arguments
+     * @param kargs Grouped Convolution Backward Weight kernel arguments
      * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
      * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
      *
@@ -758,7 +755,7 @@ struct GroupedConvolutionBackwardWeightKernel
      * @param c_ptr output C pointer
      * @param smem_ptr_0 The starting pointer of 1st shared memory block.
      * @param smem_ptr_1 The starting pointer of 2nd shared memory block.
-     * @param kargs Grouped Convolution Forward kernel arguments
+     * @param kargs Grouped Convolution Backward Weight kernel arguments
      * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
      * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
      *
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
index 8cd1710043..d4f4eca0d0 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
@@ -17,19 +17,19 @@
 namespace ck_tile {
 
 /// @brief The Grouped Convolution kernel device arguments.
-template <typename GroupedConvTraitsType>
+template <typename GroupedConvTraitsType_>
 struct GroupedConvFwdKernelArgs
 {
 
     using ConvToGemmFwdTransformer =
-        TransformConvFwdToGemm<GroupedConvTraitsType::NDimSpatial,
-                               GroupedConvTraitsType::ConvSpecialization>;
-    static constexpr index_t NumDTensor = GroupedConvTraitsType::NumDTensor;
+        TransformConvFwdToGemm<GroupedConvTraitsType_::NDimSpatial,
+                               GroupedConvTraitsType_::ConvSpecialization>;
+    static constexpr index_t NumDTensor = GroupedConvTraitsType_::NumDTensor;
 
     template <
-        typename InLay                      = typename GroupedConvTraitsType::InLayout,
-        typename WeiLay                     = typename GroupedConvTraitsType::WeiLayout,
-        typename OutLay                     = typename GroupedConvTraitsType::OutLayout,
+        typename InLay                      = typename GroupedConvTraitsType_::InLayout,
+        typename WeiLay                     = typename GroupedConvTraitsType_::WeiLayout,
+        typename OutLay                     = typename GroupedConvTraitsType_::OutLayout,
         typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NWGC> &&
                                     std::is_same_v<WeiLay, tensor_layout::convolution::GKXC> &&
                                     std::is_same_v<OutLay, tensor_layout::convolution::NWGK>,
@@ -79,13 +79,13 @@ struct GroupedConvFwdKernelArgs
 
         a_grid_desc_m_k =
             conv_to_gemm_transformer
-                .template MakeADescriptor_M_K<typename GroupedConvTraitsType::InLayout>();
+                .template MakeADescriptor_M_K<typename GroupedConvTraitsType_::InLayout>();
         b_grid_desc_n_k =
             conv_to_gemm_transformer
-                .template MakeBDescriptor_N_K<typename GroupedConvTraitsType::WeiLayout>();
+                .template MakeBDescriptor_N_K<typename GroupedConvTraitsType_::WeiLayout>();
         c_grid_desc_m_n =
             conv_to_gemm_transformer
-                .template MakeCDescriptor_M_N<typename GroupedConvTraitsType::OutLayout>();
+                .template MakeCDescriptor_M_N<typename GroupedConvTraitsType_::OutLayout>();
 
         group_stride_a = args.C_;
         group_stride_b = args.K_ * args.C_ *
@@ -97,9 +97,9 @@ struct GroupedConvFwdKernelArgs
     }
 
     template <
-        typename InLay                      = typename GroupedConvTraitsType::InLayout,
-        typename WeiLay                     = typename GroupedConvTraitsType::WeiLayout,
-        typename OutLay                     = typename GroupedConvTraitsType::OutLayout,
+        typename InLay                      = typename GroupedConvTraitsType_::InLayout,
+        typename WeiLay                     = typename GroupedConvTraitsType_::WeiLayout,
+        typename OutLay                     = typename GroupedConvTraitsType_::OutLayout,
         typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NHWGC> &&
                                     std::is_same_v<WeiLay, tensor_layout::convolution::GKYXC> &&
                                     std::is_same_v<OutLay, tensor_layout::convolution::NHWGK>,
@@ -156,13 +156,13 @@ struct GroupedConvFwdKernelArgs
 
         a_grid_desc_m_k =
             conv_to_gemm_transformer
-                .template MakeADescriptor_M_K<typename GroupedConvTraitsType::InLayout>();
+                .template MakeADescriptor_M_K<typename GroupedConvTraitsType_::InLayout>();
         b_grid_desc_n_k =
             conv_to_gemm_transformer
-                .template MakeBDescriptor_N_K<typename GroupedConvTraitsType::WeiLayout>();
+                .template MakeBDescriptor_N_K<typename GroupedConvTraitsType_::WeiLayout>();
         c_grid_desc_m_n =
             conv_to_gemm_transformer
-                .template MakeCDescriptor_M_N<typename GroupedConvTraitsType::OutLayout>();
+                .template MakeCDescriptor_M_N<typename GroupedConvTraitsType_::OutLayout>();
 
         group_stride_a = args.C_;
         group_stride_b = args.K_ * args.C_ *
@@ -174,9 +174,9 @@ struct GroupedConvFwdKernelArgs
     }
 
     template <
-        typename InLay                      = typename GroupedConvTraitsType::InLayout,
-        typename WeiLay                     = typename GroupedConvTraitsType::WeiLayout,
-        typename OutLay                     = typename GroupedConvTraitsType::OutLayout,
+        typename InLay                      = typename GroupedConvTraitsType_::InLayout,
+        typename WeiLay                     = typename GroupedConvTraitsType_::WeiLayout,
+        typename OutLay                     = typename GroupedConvTraitsType_::OutLayout,
         typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NDHWGC> &&
                                     std::is_same_v<WeiLay, tensor_layout::convolution::GKZYXC> &&
                                     std::is_same_v<OutLay, tensor_layout::convolution::NDHWGK>,
@@ -242,13 +242,13 @@ struct GroupedConvFwdKernelArgs
 
         a_grid_desc_m_k =
             conv_to_gemm_transformer
-                .template MakeADescriptor_M_K<typename GroupedConvTraitsType::InLayout>();
+                .template MakeADescriptor_M_K<typename GroupedConvTraitsType_::InLayout>();
         b_grid_desc_n_k =
             conv_to_gemm_transformer
-                .template MakeBDescriptor_N_K<typename GroupedConvTraitsType::WeiLayout>();
+                .template MakeBDescriptor_N_K<typename GroupedConvTraitsType_::WeiLayout>();
         c_grid_desc_m_n =
             conv_to_gemm_transformer
-                .template MakeCDescriptor_M_N<typename GroupedConvTraitsType::OutLayout>();
+                .template MakeCDescriptor_M_N<typename GroupedConvTraitsType_::OutLayout>();
 
         group_stride_a = args.C_;
         group_stride_b = args.K_ * args.C_ *
@@ -261,23 +261,23 @@ struct GroupedConvFwdKernelArgs
 
     using AGridDescMK = remove_cvref_t<
         decltype(ConvToGemmFwdTransformer{}
-                     .template MakeADescriptor_M_K<typename GroupedConvTraitsType::InLayout>())>;
+                     .template MakeADescriptor_M_K<typename GroupedConvTraitsType_::InLayout>())>;
     using BGridDescNK = remove_cvref_t<
         decltype(ConvToGemmFwdTransformer{}
-                     .template MakeBDescriptor_N_K<typename GroupedConvTraitsType::WeiLayout>())>;
+                     .template MakeBDescriptor_N_K<typename GroupedConvTraitsType_::WeiLayout>())>;
     using CGridDescMN = remove_cvref_t<
         decltype(ConvToGemmFwdTransformer{}
-                     .template MakeCDescriptor_M_N<typename GroupedConvTraitsType::OutLayout>())>;
+                     .template MakeCDescriptor_M_N<typename GroupedConvTraitsType_::OutLayout>())>;
 
     static constexpr index_t NonSpatialDims = 3;
-    array<index_t, NonSpatialDims + GroupedConvTraitsType::NDimSpatial> in_g_n_c_wis_lengths;
-    array<index_t, NonSpatialDims + GroupedConvTraitsType::NDimSpatial> wei_g_k_c_xs_lengths;
-    array<index_t, NonSpatialDims + GroupedConvTraitsType::NDimSpatial> out_g_n_k_wos_lengths;
+    array<index_t, NonSpatialDims + GroupedConvTraitsType_::NDimSpatial> in_g_n_c_wis_lengths;
+    array<index_t, NonSpatialDims + GroupedConvTraitsType_::NDimSpatial> wei_g_k_c_xs_lengths;
+    array<index_t, NonSpatialDims + GroupedConvTraitsType_::NDimSpatial> out_g_n_k_wos_lengths;
 
-    array<index_t, GroupedConvTraitsType::NDimSpatial> conv_filter_strides;
-    array<index_t, GroupedConvTraitsType::NDimSpatial> conv_filter_dilations;
-    array<index_t, GroupedConvTraitsType::NDimSpatial> input_left_pads;
-    array<index_t, GroupedConvTraitsType::NDimSpatial> input_right_pads;
+    array<index_t, GroupedConvTraitsType_::NDimSpatial> conv_filter_strides;
+    array<index_t, GroupedConvTraitsType_::NDimSpatial> conv_filter_dilations;
+    array<index_t, GroupedConvTraitsType_::NDimSpatial> input_left_pads;
+    array<index_t, GroupedConvTraitsType_::NDimSpatial> input_right_pads;
 
     index_t k_batch;
     index_t GemmM;
@@ -322,7 +322,7 @@ struct GroupedConvFwdKernelArgs
 ///            the policy is responsible for definition of all necessary data layouts and thread's
 ///            work distribution.
 ///
-/// @tparam GroupedConvTraitsType       The type of class providing traits for grouped convolution.
+/// @tparam GroupedConvTraitsType_       The type of class providing traits for grouped convolution.
 /// @tparam TilePartitioner_            The type of class providing mapping of workgroup index into
 /// the
 ///                                     output data tile to be calculated. It determines the
@@ -337,15 +337,15 @@ struct GroupedConvFwdKernelArgs
 ///                                     multiplication implementation. It is responsible for storing
 ///                                     results calculated by @ref GemmPipeline_ "GemmPipeline" to
 ///                                     the output C tensor in global memory.
-template <typename GroupedConvTraitsType,
+template <typename GroupedConvTraitsType_,
           typename TilePartitioner_,
           typename GemmPipeline_,
           typename EpiloguePipeline_>
 struct GroupedConvolutionForwardKernel
 {
-    static constexpr index_t NDimSpatial = GroupedConvTraitsType::NDimSpatial;
+    static constexpr index_t NDimSpatial = GroupedConvTraitsType_::NDimSpatial;
     static constexpr ConvolutionSpecialization ConvSpecialization =
-        GroupedConvTraitsType::ConvSpecialization;
+        GroupedConvTraitsType_::ConvSpecialization;
     using TilePartitioner  = remove_cvref_t<TilePartitioner_>;
     using GemmPipeline     = remove_cvref_t<GemmPipeline_>;
     using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
@@ -353,15 +353,15 @@ struct GroupedConvolutionForwardKernel
     using GemmBLayout      = remove_cvref_t<typename GemmPipeline::BLayout>;
     using GemmCLayout      = remove_cvref_t<typename GemmPipeline::CLayout>;
 
-    using InLayout  = remove_cvref_t<typename GroupedConvTraitsType::InLayout>;
-    using WeiLayout = remove_cvref_t<typename GroupedConvTraitsType::WeiLayout>;
-    using OutLayout = remove_cvref_t<typename GroupedConvTraitsType::OutLayout>;
-    using DsLayout  = remove_cvref_t<typename GroupedConvTraitsType::DsLayout>;
+    using InLayout  = remove_cvref_t<typename GroupedConvTraitsType_::InLayout>;
+    using WeiLayout = remove_cvref_t<typename GroupedConvTraitsType_::WeiLayout>;
+    using OutLayout = remove_cvref_t<typename GroupedConvTraitsType_::OutLayout>;
+    using DsLayout  = remove_cvref_t<typename GroupedConvTraitsType_::DsLayout>;
 
     using GemmDsLayout                  = remove_cvref_t<typename EpiloguePipeline::DsLayout>;
-    static constexpr index_t NumDTensor = GroupedConvTraitsType::NumDTensor;
+    static constexpr index_t NumDTensor = GroupedConvTraitsType_::NumDTensor;
 
-    static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
+    static constexpr index_t kBlockSize = GemmPipeline::BlockSize;
 
     using InDataType  = remove_cvref_t<typename GemmPipeline::ADataType>;
     using WeiDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
@@ -369,7 +369,7 @@ struct GroupedConvolutionForwardKernel
     // Below type is actually accumulation data type - the output of block GEMM.
     using OutDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
 
-    using GroupedConvFwdKernelArgsSpecialized = GroupedConvFwdKernelArgs<GroupedConvTraitsType>;
+    using GroupedConvFwdKernelArgsSpecialized = GroupedConvFwdKernelArgs<GroupedConvTraitsType_>;
 
     // TODO: Enable this
     static constexpr bool IsSplitKSupported = false;
@@ -398,7 +398,7 @@ struct GroupedConvolutionForwardKernel
             TilePartitioner::GridSize(kargs.GemmM, kargs.GemmN), kargs.GemmBatch, kargs.k_batch);
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
+    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
 
     CK_TILE_HOST static constexpr GroupedConvFwdKernelArgsSpecialized
     MakeKernelArgs(const GroupedConvFwdHostArgs& hostArgs)
diff --git a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp
index b173ab25a1..3e5e87a975 100644
--- a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp
+++ b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp
@@ -42,6 +42,7 @@ struct GroupedConvHostArgs : public conv::ConvParam
 
 using GroupedConvFwdHostArgs       = GroupedConvHostArgs<const void*, const void*, void*>;
 using GroupedConvBwdWeightHostArgs = GroupedConvHostArgs<const void*, void*, const void*>;
+using GroupedConvBwdDataHostArgs   = GroupedConvHostArgs<void*, const void*, const void*>;
 
 template <index_t NDimSpatial_,
           ConvolutionSpecialization ConvSpecialization_,
diff --git a/include/ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_data_to_gemm.hpp b/include/ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_data_to_gemm.hpp
new file mode 100644
index 0000000000..972d05ff3e
--- /dev/null
+++ b/include/ck_tile/ops/grouped_convolution/utils/transform_conv_bwd_data_to_gemm.hpp
@@ -0,0 +1,1064 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp"
+
+namespace ck_tile {
+
+template <index_t NDimSpatial,
+          ConvolutionSpecialization ConvolutionSpecialization,
+          bool SplitN              = false,
+          typename ADataType       = float,
+          typename CDataType       = float,
+          index_t NumGroupsToMerge = 1,
+          typename IndexType       = index_t>
+struct TransformConvBwdDataToGemm
+{
+    private:
+    static constexpr auto I0 = number<0>{};
+    static constexpr auto I1 = number<1>{};
+    static constexpr auto I2 = number<2>{};
+    static constexpr auto I3 = number<3>{};
+    static constexpr auto I4 = number<4>{};
+    static constexpr auto I5 = number<5>{};
+#if 0 // TODO: Enable these functionalities
+    template <typename ConvDimsType>
+    static long_index_t calculate_element_space_size_impl(const ConvDimsType& lengths,
+                                                          const ConvDimsType& strides,
+                                                          index_t i)
+    { // returns 1 + sum of (length - 1) * stride over dims [i, NDimSpatial + 3)
+        long_index_t acc = 1;
+        for(; i < (NDimSpatial + 3); i++)
+        {
+            acc +=
+                static_cast<long_index_t>(lengths[i] - I1) * static_cast<long_index_t>(strides[i]);
+        }
+
+        return acc;
+    }
+
+    template <typename ConvDimsType>
+    static IndexType GetSplitedNSize(const ConvDimsType& a_g_n_c_wis_lengths,
+                                     const ConvDimsType& a_g_n_c_wis_strides,
+                                     const ConvDimsType& c_g_n_k_wos_lengths,
+                                     const ConvDimsType& c_g_n_k_wos_strides)
+    { // returns N when max(A, C) byte size <= 2GB, else N / d (d divides N) or 1
+        const long_index_t a_element_space_size =
+            calculate_element_space_size_impl(a_g_n_c_wis_lengths, a_g_n_c_wis_strides, I1);
+        const long_index_t c_element_space_size =
+            calculate_element_space_size_impl(c_g_n_k_wos_lengths, c_g_n_k_wos_strides, I1);
+        const long_index_t element_space_size = math::max(a_element_space_size * sizeof(ADataType),
+                                                          c_element_space_size * sizeof(CDataType));
+        constexpr long_index_t TwoGB          = (long_index_t{1} << 31);
+
+        const IndexType N = a_g_n_c_wis_lengths[I1];
+
+        if(element_space_size > TwoGB)
+        {
+            // Smallest split factor that brings the byte size under 2GB
+            const auto divisor = math::integer_divide_ceil(element_space_size, TwoGB);
+
+            if(divisor <= static_cast<double>(N))
+            {
+                // Find the least divisor of N that is >= ceil(element_space_size / TwoGB).
+                // NOTE(review): search stops at sqrt(N); larger divisors are never tried.
+                for(IndexType least_divisor = divisor; least_divisor * least_divisor <= N;
+                    least_divisor++)
+                {
+                    if(N % least_divisor == 0)
+                    {
+                        return N / least_divisor;
+                    }
+                }
+                // Not found, process one Convolution N per block
+                return 1;
+            }
+            else
+            {
+                // Split Convolution's N dimension into N workgroups. However
+                // this still might not result in sufficiently small tensor,
+                // but at least later on we could divide the image as well.
+                return 1;
+            }
+        }
+        else
+        {
+            // Split N is not needed.
+            return N;
+        }
+    }
+#endif
+
+    public:
+    CK_TILE_HOST constexpr TransformConvBwdDataToGemm() {} // no explicit member initialization
+
+    template <typename TransformConvBwdDataToGemmBase> // copies each field, cast to IndexType
+    CK_TILE_HOST
+    TransformConvBwdDataToGemm(const TransformConvBwdDataToGemmBase& transform_conv_to_gemm_base)
+        : G_{static_cast<IndexType>(transform_conv_to_gemm_base.G_)},
+          N_{static_cast<IndexType>(transform_conv_to_gemm_base.N_)},
+          Di_{static_cast<IndexType>(transform_conv_to_gemm_base.Di_)},
+          Hi_{static_cast<IndexType>(transform_conv_to_gemm_base.Hi_)},
+          Wi_{static_cast<IndexType>(transform_conv_to_gemm_base.Wi_)},
+          Do_{static_cast<IndexType>(transform_conv_to_gemm_base.Do_)},
+          Ho_{static_cast<IndexType>(transform_conv_to_gemm_base.Ho_)},
+          Wo_{static_cast<IndexType>(transform_conv_to_gemm_base.Wo_)},
+          Z_{static_cast<IndexType>(transform_conv_to_gemm_base.Z_)},
+          Y_{static_cast<IndexType>(transform_conv_to_gemm_base.Y_)},
+          X_{static_cast<IndexType>(transform_conv_to_gemm_base.X_)},
+          K_{static_cast<IndexType>(transform_conv_to_gemm_base.K_)},
+          C_{static_cast<IndexType>(transform_conv_to_gemm_base.C_)},
+          ConvStrideD_{static_cast<IndexType>(transform_conv_to_gemm_base.ConvStrideD_)},
+          ConvStrideH_{static_cast<IndexType>(transform_conv_to_gemm_base.ConvStrideH_)},
+          ConvStrideW_{static_cast<IndexType>(transform_conv_to_gemm_base.ConvStrideW_)},
+          ConvDilationD_{static_cast<IndexType>(transform_conv_to_gemm_base.ConvDilationD_)},
+          ConvDilationH_{static_cast<IndexType>(transform_conv_to_gemm_base.ConvDilationH_)},
+          ConvDilationW_{static_cast<IndexType>(transform_conv_to_gemm_base.ConvDilationW_)},
+          InLeftPadD_{static_cast<IndexType>(transform_conv_to_gemm_base.InLeftPadD_)},
+          InLeftPadH_{static_cast<IndexType>(transform_conv_to_gemm_base.InLeftPadH_)},
+          InLeftPadW_{static_cast<IndexType>(transform_conv_to_gemm_base.InLeftPadW_)},
+          InRightPadD_{static_cast<IndexType>(transform_conv_to_gemm_base.InRightPadD_)},
+          InRightPadH_{static_cast<IndexType>(transform_conv_to_gemm_base.InRightPadH_)},
+          InRightPadW_{static_cast<IndexType>(transform_conv_to_gemm_base.InRightPadW_)}
+    { // NOTE(review): Idx*Tilde_/Gcd*/*Tilde_/*Dot_ members are not copied here; confirm.
+    }
+
+    template <typename ConvDimsType, // 1D overload, selected when NDim == 1
+              typename ConvSpatialDimsType,
+              index_t NDim                                   = NDimSpatial,
+              typename std::enable_if<NDim == 1, bool>::type = false>
+    CK_TILE_HOST TransformConvBwdDataToGemm(const ConvDimsType& a_g_n_c_wis_lengths,
+                                            const ConvDimsType& b_g_k_c_xs_lengths,
+                                            const ConvDimsType& c_g_n_k_wos_lengths,
+                                            const ConvSpatialDimsType& conv_filter_strides,
+                                            const ConvSpatialDimsType& conv_filter_dilations,
+                                            const ConvSpatialDimsType& input_left_pads,
+                                            const ConvSpatialDimsType& input_right_pads,
+                                            const ConvSpatialDimsType& tildes)
+        : G_{a_g_n_c_wis_lengths[I0]}, // length arrays: [0]=G, [1]=N, [3]=W spatial extent
+          N_{a_g_n_c_wis_lengths[I1]},
+          Di_{I1}, // D/H members get 1 (extent, filter, stride, dilation) or 0 (pads)
+          Hi_{I1},
+          Wi_{a_g_n_c_wis_lengths[I3]},
+          Do_{I1},
+          Ho_{I1},
+          Wo_{c_g_n_k_wos_lengths[I3]},
+          Z_{I1},
+          Y_{I1},
+          X_{b_g_k_c_xs_lengths[I3]},
+          K_{c_g_n_k_wos_lengths[I2]},
+          C_{b_g_k_c_xs_lengths[I2]},
+          ConvStrideD_{I1},
+          ConvStrideH_{I1},
+          ConvStrideW_{conv_filter_strides[I0]},
+          ConvDilationD_{I1},
+          ConvDilationH_{I1},
+          ConvDilationW_{conv_filter_dilations[I0]},
+          InLeftPadD_{I0},
+          InLeftPadH_{I0},
+          InLeftPadW_{input_left_pads[I0]},
+          InRightPadD_{I0},
+          InRightPadH_{I0},
+          InRightPadW_{input_right_pads[I0]},
+          IdxZTilde_{I1},
+          IdxYTilde_{I1},
+          IdxXTilde_{tildes[I0]}
+    {
+#if 0 // TODO: Enable SplitN; the disabled code uses *_strides that are not ctor parameters
+        if constexpr(SplitN)
+        {
+            N_ = GetSplitedNSize(
+                a_g_n_c_wis_lengths, a_g_n_c_wis_strides, c_g_n_k_wos_lengths, c_g_n_k_wos_strides);
+        }
+        else
+        {
+            N_ = c_g_n_k_wos_lengths[I1];
+        }
+#endif
+
+        GcdStrideDilationW_ = gcd(ConvStrideW_, ConvDilationW_);
+        XTilde_             = ConvStrideW_ / GcdStrideDilationW_;
+        WTilde_             = Wo_ + integer_divide_ceil(ConvDilationW_ * (X_ - I1), ConvStrideW_);
+        XDot_               = integer_divide_ceil(X_, XTilde_);
+    }
+
+    template <typename ConvDimsType, // 2D overload, selected when NDim == 2
+              typename ConvSpatialDimsType,
+              index_t NDim                                   = NDimSpatial,
+              typename std::enable_if<NDim == 2, bool>::type = false>
+    CK_TILE_HOST TransformConvBwdDataToGemm(const ConvDimsType& a_g_n_c_wis_lengths,
+                                            const ConvDimsType& b_g_k_c_xs_lengths,
+                                            const ConvDimsType& c_g_n_k_wos_lengths,
+                                            const ConvSpatialDimsType& conv_filter_strides,
+                                            const ConvSpatialDimsType& conv_filter_dilations,
+                                            const ConvSpatialDimsType& input_left_pads,
+                                            const ConvSpatialDimsType& input_right_pads,
+                                            const ConvSpatialDimsType& tildes)
+        : G_{a_g_n_c_wis_lengths[I0]}, // length arrays: [0]=G, [1]=N, [3]=H, [4]=W
+          N_{a_g_n_c_wis_lengths[I1]},
+          Di_{I1}, // D members get 1 (extent, filter, stride, dilation) or 0 (pads)
+          Hi_{a_g_n_c_wis_lengths[I3]},
+          Wi_{a_g_n_c_wis_lengths[I4]},
+          Do_{I1},
+          Ho_{c_g_n_k_wos_lengths[I3]},
+          Wo_{c_g_n_k_wos_lengths[I4]},
+          Z_{I1},
+          Y_{b_g_k_c_xs_lengths[I3]},
+          X_{b_g_k_c_xs_lengths[I4]},
+          K_{c_g_n_k_wos_lengths[I2]},
+          C_{b_g_k_c_xs_lengths[I2]},
+          ConvStrideD_{I1},
+          ConvStrideH_{conv_filter_strides[I0]}, // spatial arrays are ordered (H, W)
+          ConvStrideW_{conv_filter_strides[I1]},
+          ConvDilationD_{I1},
+          ConvDilationH_{conv_filter_dilations[I0]},
+          ConvDilationW_{conv_filter_dilations[I1]},
+          InLeftPadD_{I0},
+          InLeftPadH_{input_left_pads[I0]},
+          InLeftPadW_{input_left_pads[I1]},
+          InRightPadD_{I0},
+          InRightPadH_{input_right_pads[I0]},
+          InRightPadW_{input_right_pads[I1]},
+          IdxZTilde_{I1},
+          IdxYTilde_{tildes[I0]},
+          IdxXTilde_{tildes[I1]}
+    {
+#if 0 // TODO: Enable SplitN; the disabled code uses *_strides that are not ctor parameters
+        if constexpr(SplitN)
+        {
+            N_ = GetSplitedNSize(
+                a_g_n_c_wis_lengths, a_g_n_c_wis_strides, c_g_n_k_wos_lengths, c_g_n_k_wos_strides);
+        }
+        else
+        {
+            N_ = c_g_n_k_wos_lengths[I1];
+        }
+#endif
+        GcdStrideDilationW_ = gcd(ConvStrideW_, ConvDilationW_);
+        GcdStrideDilationH_ = gcd(ConvStrideH_, ConvDilationH_);
+        XTilde_             = ConvStrideW_ / GcdStrideDilationW_;
+        YTilde_             = ConvStrideH_ / GcdStrideDilationH_;
+        WTilde_             = Wo_ + integer_divide_ceil(ConvDilationW_ * (X_ - I1), ConvStrideW_);
+        HTilde_             = Ho_ + integer_divide_ceil(ConvDilationH_ * (Y_ - I1), ConvStrideH_);
+        XDot_               = integer_divide_ceil(X_, XTilde_);
+        YDot_               = integer_divide_ceil(Y_, YTilde_);
+    }
+
+    // Constructor for the 3D spatial case; SFINAE-enabled only when NDim == 3.
+    //
+    // Which entries are read from each argument (see the initializer list):
+    //   a_g_n_c_wis_lengths   - [I0] -> G_, [I1] -> N_, [I3..I5] -> Di_/Hi_/Wi_
+    //   b_g_k_c_xs_lengths    - [I2] -> C_, [I3..I5] -> Z_/Y_/X_
+    //   c_g_n_k_wos_lengths   - [I2] -> K_, [I3..I5] -> Do_/Ho_/Wo_
+    //   conv_filter_strides, conv_filter_dilations, input_left_pads, input_right_pads
+    //                         - [I0], [I1], [I2] -> the D, H and W value respectively
+    //   tildes                - [I0], [I1], [I2] -> IdxZTilde_, IdxYTilde_, IdxXTilde_
+    //
+    // NOTE(review): `tildes` is read in the initializer list, so its [[maybe_unused]]
+    // attribute looks redundant for this overload.
+    template <typename ConvDimsType,
+              typename ConvSpatialDimsType,
+              index_t NDim                                   = NDimSpatial,
+              typename std::enable_if<NDim == 3, bool>::type = false>
+    CK_TILE_HOST TransformConvBwdDataToGemm(const ConvDimsType& a_g_n_c_wis_lengths,
+                                            const ConvDimsType& b_g_k_c_xs_lengths,
+                                            const ConvDimsType& c_g_n_k_wos_lengths,
+                                            const ConvSpatialDimsType& conv_filter_strides,
+                                            const ConvSpatialDimsType& conv_filter_dilations,
+                                            const ConvSpatialDimsType& input_left_pads,
+                                            const ConvSpatialDimsType& input_right_pads,
+                                            [[maybe_unused]] const ConvSpatialDimsType& tildes)
+        : G_{a_g_n_c_wis_lengths[I0]},
+          N_{a_g_n_c_wis_lengths[I1]},
+          Di_{a_g_n_c_wis_lengths[I3]},
+          Hi_{a_g_n_c_wis_lengths[I4]},
+          Wi_{a_g_n_c_wis_lengths[I5]},
+          Do_{c_g_n_k_wos_lengths[I3]},
+          Ho_{c_g_n_k_wos_lengths[I4]},
+          Wo_{c_g_n_k_wos_lengths[I5]},
+          Z_{b_g_k_c_xs_lengths[I3]},
+          Y_{b_g_k_c_xs_lengths[I4]},
+          X_{b_g_k_c_xs_lengths[I5]},
+          K_{c_g_n_k_wos_lengths[I2]},
+          C_{b_g_k_c_xs_lengths[I2]},
+          ConvStrideD_{conv_filter_strides[I0]},
+          ConvStrideH_{conv_filter_strides[I1]},
+          ConvStrideW_{conv_filter_strides[I2]},
+          ConvDilationD_{conv_filter_dilations[I0]},
+          ConvDilationH_{conv_filter_dilations[I1]},
+          ConvDilationW_{conv_filter_dilations[I2]},
+          InLeftPadD_{input_left_pads[I0]},
+          InLeftPadH_{input_left_pads[I1]},
+          InLeftPadW_{input_left_pads[I2]},
+          InRightPadD_{input_right_pads[I0]},
+          InRightPadH_{input_right_pads[I1]},
+          InRightPadW_{input_right_pads[I2]},
+          IdxZTilde_{tildes[I0]},
+          IdxYTilde_{tildes[I1]},
+          IdxXTilde_{tildes[I2]}
+    {
+        // Disabled: the SplitN path below uses a_g_n_c_wis_strides and c_g_n_k_wos_strides,
+        // which are not parameters of this constructor, so it cannot be enabled as written.
+#if 0 // TODO: Enable these functionalities
+        if constexpr(SplitN)
+        {
+            N_ = GetSplitedNSize(
+                a_g_n_c_wis_lengths, a_g_n_c_wis_strides, c_g_n_k_wos_lengths, c_g_n_k_wos_strides);
+        }
+        else
+        {
+            N_ = c_g_n_k_wos_lengths[I1];
+        }
+#endif
+        // Derived per-dimension quantities (W, H, D handled identically):
+        //   Gcd    = gcd(stride, dilation)
+        //   *Tilde = stride / Gcd                       (XTilde_, YTilde_, ZTilde_)
+        //   W/H/DTilde = output size + ceil(dilation * (filter size - 1) / stride)
+        //   *Dot   = ceil(filter size / *Tilde)         (XDot_, YDot_, ZDot_)
+        GcdStrideDilationW_ = gcd(ConvStrideW_, ConvDilationW_);
+        GcdStrideDilationH_ = gcd(ConvStrideH_, ConvDilationH_);
+        GcdStrideDilationD_ = gcd(ConvStrideD_, ConvDilationD_);
+        XTilde_             = ConvStrideW_ / GcdStrideDilationW_;
+        YTilde_             = ConvStrideH_ / GcdStrideDilationH_;
+        ZTilde_             = ConvStrideD_ / GcdStrideDilationD_;
+        WTilde_             = Wo_ + integer_divide_ceil(ConvDilationW_ * (X_ - I1), ConvStrideW_);
+        HTilde_             = Ho_ + integer_divide_ceil(ConvDilationH_ * (Y_ - I1), ConvStrideH_);
+        DTilde_             = Do_ + integer_divide_ceil(ConvDilationD_ * (Z_ - I1), ConvStrideD_);
+        XDot_               = integer_divide_ceil(X_, XTilde_);
+        YDot_               = integer_divide_ceil(Y_, YTilde_);
+        ZDot_               = integer_divide_ceil(Z_, ZTilde_);
+    }
+
+#if 0 // TODO: Enable these functionalities
+    // (Currently compiled out by the enclosing `#if 0`.)
+    // Returns true when both the input (A) and output (C) tensors span at most 2^31 bytes.
+    // Note the comparison is `<=`, so a tensor of exactly 2^31 bytes also passes despite the
+    // "SmallerThan" name.
+    // NOTE(review): the *Stride_ / *StrideTensor*_ members used here are not set by the
+    // constructors visible in this part of the file - confirm they exist before enabling.
+    __host__ bool AreDescriptorsSmallerThan2GB() const
+    {
+        constexpr long_index_t TwoGB = (long_index_t{1} << 31);
+
+        // Space size in elements: 1 + sum over dimensions of (length - 1) * stride.
+        const long_index_t in_desc_space_size =
+            I1 + (N_ - I1) * NStrideTensorA_ + (Di_ - I1) * DiStride_ + (Hi_ - I1) * HiStride_ +
+            (Wi_ - I1) * WiStride_ + (C_ - I1) * CStrideTensorA_;
+        const long_index_t out_desc_space_size =
+            I1 + (N_ - I1) * NStrideTensorC_ + (Do_ - I1) * DoStride_ + (Ho_ - I1) * HoStride_ +
+            (Wo_ - I1) * WoStride_ + (K_ - I1) * KStrideTensorC_;
+
+        // Convert element counts to bytes before comparing against the limit.
+        bool is_a_descriptor_smaller_than_2GB = (in_desc_space_size * sizeof(ADataType)) <= TwoGB;
+        bool is_c_descriptor_smaller_than_2GB = (out_desc_space_size * sizeof(CDataType)) <= TwoGB;
+
+        return is_a_descriptor_smaller_than_2GB && is_c_descriptor_smaller_than_2GB;
+    }
+
+    // (Currently compiled out by the enclosing `#if 0`.)
+    // Splits the convolution problem in two along a single spatial dimension.
+    // The dimension is chosen by priority D, then H, then W (if / else-if chain below);
+    // only the first dimension that can be split is split.
+    //
+    // Returns a tuple of:
+    //   - the "left" transformer (first half of the output along the split dimension),
+    //   - the "right" transformer (remaining part of the output),
+    //   - a_grid_ptr_base advanced by the right half's input offset,
+    //   - c_grid_ptr_base advanced by the right half's output offset.
+    // If no dimension can be split, both transformers are plain copies of *this and both
+    // pointers are returned unchanged (offsets stay 0).
+    //
+    // NOTE(review): this uses `math::min`, while the enabled code in this part of the file
+    // calls `min` unqualified - confirm the `math::` namespace is available before enabling.
+    __host__ auto SplitConvProblem(const ADataType* a_grid_ptr_base,
+                                   CDataType* c_grid_ptr_base) const
+    {
+        // Create copies
+        auto conv_to_gemm_transformer_left  = *this;
+        auto conv_to_gemm_transformer_right = *this;
+        IndexType a_right_offset            = 0;
+        IndexType c_right_offset            = 0;
+        // Calculate real filter size
+        // (effective extent once dilation is applied: (size - 1) * dilation + 1)
+        const IndexType z_eff = (Z_ - 1) * ConvDilationD_ + 1;
+        const IndexType y_eff = (Y_ - 1) * ConvDilationH_ + 1;
+        const IndexType x_eff = (X_ - 1) * ConvDilationW_ + 1;
+        // Calculate start position in input for right tensor
+        const IndexType di_right_transformer_start_idx = (Do_ / 2) * ConvStrideD_;
+        const IndexType hi_right_transformer_start_idx = (Ho_ / 2) * ConvStrideH_;
+        const IndexType wi_right_transformer_start_idx = (Wo_ / 2) * ConvStrideW_;
+        // Calculate last position in input for left tensor
+        const IndexType di_left_transformer_end_idx = (Do_ / 2 - 1) * ConvStrideD_ + z_eff;
+        const IndexType hi_left_transformer_end_idx = (Ho_ / 2 - 1) * ConvStrideH_ + y_eff;
+        const IndexType wi_left_transformer_end_idx = (Wo_ / 2 - 1) * ConvStrideW_ + x_eff;
+        // Allow to split if whole left padding will be in left tensor and right padding in right
+        // tensor
+        // (a dimension whose output size is 1 is never split)
+        const bool is_possible_to_split_d = Do_ != 1 &&
+                                            di_right_transformer_start_idx > InLeftPadD_ &&
+                                            di_left_transformer_end_idx <= (InLeftPadD_ + Di_);
+        const bool is_possible_to_split_h = Ho_ != 1 &&
+                                            hi_right_transformer_start_idx > InLeftPadH_ &&
+                                            hi_left_transformer_end_idx <= (InLeftPadH_ + Hi_);
+        const bool is_possible_to_split_w = Wo_ != 1 &&
+                                            wi_right_transformer_start_idx > InLeftPadW_ &&
+                                            wi_left_transformer_end_idx <= (InLeftPadW_ + Wi_);
+
+        if(is_possible_to_split_d)
+        {
+            // Apply new sizes
+            // Split output on half
+            // (left gets floor(Do_ / 2), right gets the remainder)
+            conv_to_gemm_transformer_left.Do_  = Do_ / 2;
+            conv_to_gemm_transformer_right.Do_ = Do_ - Do_ / 2;
+            // Assign left padding to left convolution
+            conv_to_gemm_transformer_left.InLeftPadD_  = InLeftPadD_;
+            conv_to_gemm_transformer_right.InLeftPadD_ = 0;
+            // Assign right padding to right convolution
+            conv_to_gemm_transformer_left.InRightPadD_  = 0;
+            conv_to_gemm_transformer_right.InRightPadD_ = InRightPadD_;
+            // Calculate new input size
+            conv_to_gemm_transformer_left.Di_ = di_left_transformer_end_idx - InLeftPadD_;
+            conv_to_gemm_transformer_right.Di_ =
+                math::min(Di_ - (di_right_transformer_start_idx - InLeftPadD_),
+                          (conv_to_gemm_transformer_right.Do_ - 1) * ConvStrideD_ + z_eff);
+            // NOTE(review): the lone `;` below is a stray empty statement.
+            ;
+            // Calculate offsets
+            a_right_offset = ((Do_ / 2) * ConvStrideD_ - InLeftPadD_) * DiStride_;
+            c_right_offset = (Do_ / 2) * DoStride_;
+        }
+        else if(is_possible_to_split_h)
+        {
+            // Same steps as the D split above, applied to the H dimension.
+            conv_to_gemm_transformer_left.Ho_  = Ho_ / 2;
+            conv_to_gemm_transformer_right.Ho_ = Ho_ - Ho_ / 2;
+
+            conv_to_gemm_transformer_left.InLeftPadH_  = InLeftPadH_;
+            conv_to_gemm_transformer_right.InLeftPadH_ = 0;
+
+            conv_to_gemm_transformer_left.InRightPadH_  = 0;
+            conv_to_gemm_transformer_right.InRightPadH_ = InRightPadH_;
+
+            conv_to_gemm_transformer_left.Hi_ = hi_left_transformer_end_idx - InLeftPadH_;
+            conv_to_gemm_transformer_right.Hi_ =
+                math::min(Hi_ - (hi_right_transformer_start_idx - InLeftPadH_),
+                          (conv_to_gemm_transformer_right.Ho_ - 1) * ConvStrideH_ + y_eff);
+            a_right_offset = ((Ho_ / 2) * ConvStrideH_ - InLeftPadH_) * HiStride_;
+            c_right_offset = (Ho_ / 2) * HoStride_;
+        }
+        else if(is_possible_to_split_w)
+        {
+            // Same steps as the D split above, applied to the W dimension.
+            conv_to_gemm_transformer_left.Wo_  = Wo_ / 2;
+            conv_to_gemm_transformer_right.Wo_ = Wo_ - Wo_ / 2;
+
+            conv_to_gemm_transformer_left.InLeftPadW_  = InLeftPadW_;
+            conv_to_gemm_transformer_right.InLeftPadW_ = 0;
+
+            conv_to_gemm_transformer_left.InRightPadW_  = 0;
+            conv_to_gemm_transformer_right.InRightPadW_ = InRightPadW_;
+
+            conv_to_gemm_transformer_left.Wi_ = wi_left_transformer_end_idx - InLeftPadW_;
+            conv_to_gemm_transformer_right.Wi_ =
+                math::min(Wi_ - (wi_right_transformer_start_idx - InLeftPadW_),
+                          (conv_to_gemm_transformer_right.Wo_ - 1) * ConvStrideW_ + x_eff);
+
+            a_right_offset = ((Wo_ / 2) * ConvStrideW_ - InLeftPadW_) * WiStride_;
+            c_right_offset = (Wo_ / 2) * WoStride_;
+        }
+        // Return left transform, right transformer, right offset to Input and right offset to
+        // Output
+        return ck_tile::make_tuple(conv_to_gemm_transformer_left,
+                              conv_to_gemm_transformer_right,
+                              a_grid_ptr_base + a_right_offset,
+                              c_grid_ptr_base + c_right_offset);
+    }
+#endif
+
+    // 1D case (NDim == 1): naive descriptor of the output tensor, stored as NWGK and
+    // exposed with lengths (N, Wo, K). G_ only contributes to the N and Wo strides.
+    template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 1, bool>::type = false>
+    CK_TILE_HOST auto make_out_grid_desc() const
+    {
+        // Strides, innermost first
+        constexpr auto stride_k = I1;
+        const index_t stride_wo = G_ * K_;
+        const index_t stride_n  = Wo_ * G_ * K_;
+
+        const auto lengths = make_tuple(N_, Wo_, K_);
+        const auto strides = make_tuple(stride_n, stride_wo, stride_k);
+
+        // TODO Add support for NumGroupsToMerge > 1
+        return make_naive_tensor_descriptor(lengths, strides);
+    }
+
+    // 1D case (NDim == 1): packed descriptor of the weight tensor (GKXC) with
+    // lengths (K, X, C).
+    template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 1, bool>::type = false>
+    CK_TILE_HOST auto make_wei_grid_desc() const
+    {
+        const auto lengths = make_tuple(K_, X_, C_);
+        return make_naive_tensor_descriptor_packed(lengths);
+    }
+
+    // 1D case (NDim == 1): naive descriptor of the input tensor, stored as NWGC and
+    // exposed with lengths (N, Wi, C). G_ only contributes to the N and Wi strides.
+    template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 1, bool>::type = false>
+    CK_TILE_HOST auto make_in_grid_desc() const
+    {
+        // Strides, innermost first
+        constexpr auto stride_c = I1;
+        const index_t stride_wi = G_ * C_;
+        const index_t stride_n  = Wi_ * G_ * C_;
+
+        const auto lengths = make_tuple(N_, Wi_, C_);
+        const auto strides = make_tuple(stride_n, stride_wi, stride_c);
+
+        // TODO Add support for NumGroupsToMerge > 1
+        return make_naive_tensor_descriptor(lengths, strides);
+    }
+
+    // 2D case (NDim == 2): naive descriptor of the output tensor, stored as NHWGK and
+    // exposed with lengths (N, Ho, Wo, K). G_ only contributes to the strides.
+    template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 2, bool>::type = false>
+    CK_TILE_HOST auto make_out_grid_desc() const
+    {
+        // Strides, innermost first
+        constexpr auto stride_k = I1;
+        const index_t stride_wo = G_ * K_;
+        const index_t stride_ho = Wo_ * G_ * K_;
+        const index_t stride_n  = Ho_ * Wo_ * G_ * K_;
+
+        const auto lengths = make_tuple(N_, Ho_, Wo_, K_);
+        const auto strides = make_tuple(stride_n, stride_ho, stride_wo, stride_k);
+
+        // TODO Add support for NumGroupsToMerge > 1
+        return make_naive_tensor_descriptor(lengths, strides);
+    }
+
+    // 2D case (NDim == 2): naive descriptor of the input tensor, stored as NHWGC and
+    // exposed with lengths (N, Hi, Wi, C). G_ only contributes to the strides.
+    template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 2, bool>::type = false>
+    CK_TILE_HOST auto make_in_grid_desc() const
+    {
+        // Strides, innermost first
+        constexpr auto stride_c = I1;
+        const index_t stride_wi = G_ * C_;
+        const index_t stride_hi = Wi_ * G_ * C_;
+        const index_t stride_n  = Hi_ * Wi_ * G_ * C_;
+
+        const auto lengths = make_tuple(N_, Hi_, Wi_, C_);
+        const auto strides = make_tuple(stride_n, stride_hi, stride_wi, stride_c);
+
+        // TODO Add support for NumGroupsToMerge > 1
+        return make_naive_tensor_descriptor(lengths, strides);
+    }
+
+    // 2D case (NDim == 2): packed descriptor of the weight tensor (GKYXC) with
+    // lengths (K, Y, X, C).
+    template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 2, bool>::type = false>
+    CK_TILE_HOST auto make_wei_grid_desc() const
+    {
+        const auto lengths = make_tuple(K_, Y_, X_, C_);
+        return make_naive_tensor_descriptor_packed(lengths);
+    }
+
+    // 3D case (NDim == 3): naive descriptor of the output tensor, stored as NDHWGK and
+    // exposed with lengths (N, Do, Ho, Wo, K). G_ only contributes to the strides.
+    template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 3, bool>::type = false>
+    CK_TILE_HOST auto make_out_grid_desc() const
+    {
+        // Strides, innermost first
+        constexpr auto stride_k = I1;
+        const index_t stride_wo = G_ * K_;
+        const index_t stride_ho = Wo_ * G_ * K_;
+        const index_t stride_do = Ho_ * Wo_ * G_ * K_;
+        const index_t stride_n  = Do_ * Ho_ * Wo_ * G_ * K_;
+
+        const auto lengths = make_tuple(N_, Do_, Ho_, Wo_, K_);
+        const auto strides = make_tuple(stride_n, stride_do, stride_ho, stride_wo, stride_k);
+
+        // TODO Add support for NumGroupsToMerge > 1
+        return make_naive_tensor_descriptor(lengths, strides);
+    }
+
+    // 3D case (NDim == 3): naive descriptor of the input tensor with lengths
+    // (N, Di, Hi, Wi, C). The strides follow an N, Di, Hi, Wi, G, C ordering, so G_ only
+    // contributes to the strides.
+    template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 3, bool>::type = false>
+    CK_TILE_HOST auto make_in_grid_desc() const
+    {
+        // Strides, innermost first
+        constexpr auto stride_c = I1;
+        const index_t stride_wi = G_ * C_;
+        const index_t stride_hi = Wi_ * G_ * C_;
+        const index_t stride_di = Hi_ * Wi_ * G_ * C_;
+        const index_t stride_n  = Di_ * Hi_ * Wi_ * G_ * C_;
+
+        const auto lengths = make_tuple(N_, Di_, Hi_, Wi_, C_);
+        const auto strides = make_tuple(stride_n, stride_di, stride_hi, stride_wi, stride_c);
+
+        // TODO Add support for NumGroupsToMerge > 1
+        return make_naive_tensor_descriptor(lengths, strides);
+    }
+
+    // 3D case (NDim == 3): packed descriptor of the weight tensor (GKZYXC) with
+    // lengths (K, Z, Y, X, C).
+    template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 3, bool>::type = false>
+    CK_TILE_HOST auto make_wei_grid_desc() const
+    {
+        const auto lengths = make_tuple(K_, Z_, Y_, X_, C_);
+        return make_naive_tensor_descriptor_packed(lengths);
+    }
+    // TODO: implement ck_tile::tensor_layout::convolution that describes packed/strided
+    // dimensions as properties
+
+    // 1D case (NDim == 1): builds the three 2D GEMM-view descriptors for backward-data
+    // convolution and returns them as a tuple (output view, weight view, input view):
+    //   - output view: dim 0 merges (N, WTildeSlice), dim 1 merges (XDotSlice, K)
+    //   - weight view: dim 0 is C,                    dim 1 merges (XDotSlice, K)
+    //   - input view:  dim 0 merges (N, WTildeSlice), dim 1 is C
+    // IdxXTilde_ is the XTilde index frozen in the weight and input views.
+    // GemmKBatch is currently unused.
+    template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 1, bool>::type = false>
+    CK_TILE_HOST auto
+    MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N([[maybe_unused]] const index_t GemmKBatch) const
+    {
+        // only work on the part of WTilde that contributes to the non-padding area of the
+        // input tensor
+        const auto IWTildeSliceBegin = integer_divide_floor(
+            max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_);
+
+        const auto IWTildeSliceEnd =
+            min(WTilde_, integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1);
+
+        const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin;
+
+        // GemmK is different for each GEMM
+        // (it depends on IdxXTilde_)
+        const auto XDotSlice = integer_divide_ceil(X_ - IdxXTilde_, XTilde_);
+
+        const auto out_grid_desc = make_out_grid_desc<NDimSpatial>();
+        const auto in_grid_desc  = make_in_grid_desc<NDimSpatial>();
+        const auto wei_grid_desc = make_wei_grid_desc<NDimSpatial>();
+
+        // A: output tensor comes in K_M
+        // Step 1: pad Wo by zero on both sides; N and K pass through.
+        const auto out_n_wop_k_grid_desc =
+            transform_tensor_descriptor(out_grid_desc,
+                                        make_tuple(make_pass_through_transform(N_),
+                                                   make_pad_transform(Wo_, I0, I0),
+                                                   make_pass_through_transform(K_)),
+                                        make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+                                        make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
+
+        // Step 2: embed the W dimension into (XDot, WTilde) with coefficients
+        // (-ConvDilationW_ / GcdStrideDilationW_, 1).
+        const auto out_n_xdot_wtilde_k_grid_desc = transform_tensor_descriptor(
+            out_n_wop_k_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_embed_transform(make_tuple(XDot_, WTilde_),
+                                            make_tuple(-ConvDilationW_ / GcdStrideDilationW_, I1)),
+                       make_pass_through_transform(K_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+            make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3>{}));
+
+        // Step 3: slice XDot (from I0, with XDotSlice) and WTilde (from IWTildeSliceBegin,
+        // with WTildeSlice).
+        // NOTE(review): confirm whether make_slice_transform's third argument is a length or
+        // an end index.
+        const auto out_n_xdotslice_wtildeslice_k_grid_desc = transform_tensor_descriptor(
+            out_n_xdot_wtilde_k_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_slice_transform(XDot_, I0, XDotSlice),
+                       make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
+                       make_pass_through_transform(K_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}));
+
+        // Step 4: merge (XDotSlice, K) into upper dim 1 and (N, WTildeSlice) into upper dim 0.
+        const auto out_gemmm_gemmkraw_grid_desc = transform_tensor_descriptor(
+            out_n_xdotslice_wtildeslice_k_grid_desc,
+            make_tuple(make_merge_transform(make_tuple(XDotSlice, K_)),
+                       make_merge_transform(make_tuple(N_, WTildeSlice))),
+            make_tuple(sequence<1, 3>{}, sequence<0, 2>{}),
+            make_tuple(sequence<1>{}, sequence<0>{}));
+
+        // B: weight tensor comes in K_N
+        // Step 1: embed the X dimension into (XDot, XTilde) with coefficients
+        // (ConvStrideW_ / GcdStrideDilationW_, 1).
+        const auto wei_k_xdot_xtilde_c_grid_desc = transform_tensor_descriptor(
+            wei_grid_desc,
+            make_tuple(make_pass_through_transform(K_),
+                       make_embed_transform(make_tuple(XDot_, XTilde_),
+                                            make_tuple(ConvStrideW_ / GcdStrideDilationW_, I1)),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+            make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3>{}));
+
+        // Step 2: slice XDot and freeze XTilde at IdxXTilde_ (the frozen dimension has no
+        // upper dimension, hence the empty sequence<>).
+        const auto wei_k_xdotslice_c_grid_desc = transform_tensor_descriptor(
+            wei_k_xdot_xtilde_c_grid_desc,
+            make_tuple(make_pass_through_transform(K_),
+                       make_slice_transform(XDot_, I0, XDotSlice),
+                       make_freeze_transform(IdxXTilde_),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<>{}, sequence<2>{}));
+
+        // Step 3: merge (XDotSlice, K) into upper dim 1; C becomes upper dim 0.
+        const auto wei_gemmn_gemmkraw_grid_desc =
+            transform_tensor_descriptor(wei_k_xdotslice_c_grid_desc,
+                                        make_tuple(make_merge_transform(make_tuple(XDotSlice, K_)),
+                                                   make_pass_through_transform(C_)),
+                                        make_tuple(sequence<1, 0>{}, sequence<2>{}),
+                                        make_tuple(sequence<1>{}, sequence<0>{}));
+
+        // C: input tensor
+        // Step 1: pad Wi with the left/right input padding.
+        const auto in_n_wip_c_grid_desc = transform_tensor_descriptor(
+            in_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_pad_transform(Wi_, InLeftPadW_, InRightPadW_),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
+
+        // Step 2: embed the padded W dimension into (XTilde, WTilde) with coefficients
+        // (ConvDilationW_, ConvStrideW_).
+        const auto in_n_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor(
+            in_n_wip_c_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_embed_transform(make_tuple(XTilde_, WTilde_),
+                                            make_tuple(ConvDilationW_, ConvStrideW_)),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+            make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3>{}));
+
+        // Step 3: freeze XTilde at IdxXTilde_ and slice WTilde with the same begin/slice
+        // values used for the output tensor.
+        const auto in_n_wtildeslice_c_grid_desc = transform_tensor_descriptor(
+            in_n_xtilde_wtilde_c_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_freeze_transform(IdxXTilde_),
+                       make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}),
+            make_tuple(sequence<0>{}, sequence<>{}, sequence<1>{}, sequence<2>{}));
+
+        // Step 4: merge (N, WTildeSlice) into upper dim 0; C becomes upper dim 1.
+        const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor(
+            in_n_wtildeslice_c_grid_desc,
+            make_tuple(make_merge_transform(make_tuple(N_, WTildeSlice)),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0, 1>{}, sequence<2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return make_tuple(out_gemmm_gemmkraw_grid_desc,
+                          wei_gemmn_gemmkraw_grid_desc,
+                          in_gemmmraw_gemmnraw_grid_desc);
+    }
+
+    // 2D case (NDim == 2): builds the three 2D GEMM-view descriptors for backward-data
+    // convolution and returns them as a tuple (output view, weight view, input view):
+    //   - output view: dim 0 merges (N, HTildeSlice, WTildeSlice),
+    //                  dim 1 merges (YDotSlice, XDotSlice, K)
+    //   - weight view: dim 0 is C, dim 1 merges (YDotSlice, XDotSlice, K)
+    //   - input view:  dim 0 merges (N, HTildeSlice, WTildeSlice), dim 1 is C
+    // IdxYTilde_ / IdxXTilde_ are the YTilde / XTilde indices frozen in the weight and
+    // input views. GemmKBatch is currently unused.
+    template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 2, bool>::type = false>
+    CK_TILE_HOST auto
+    MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N([[maybe_unused]] const index_t GemmKBatch) const
+    {
+        // only work on HTilde and WTilde that contribute to non-padding area of input tensor
+        const auto IHTildeSliceBegin = integer_divide_floor(
+            max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_);
+        const auto IWTildeSliceBegin = integer_divide_floor(
+            max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_);
+
+        const auto IHTildeSliceEnd =
+            min(HTilde_, integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) + I1);
+        const auto IWTildeSliceEnd =
+            min(WTilde_, integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1);
+
+        const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin;
+        const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin;
+
+        // GemmK is different for each GEMM
+        // (it depends on IdxYTilde_ and IdxXTilde_)
+        const auto YDotSlice = integer_divide_ceil(Y_ - IdxYTilde_, YTilde_);
+        const auto XDotSlice = integer_divide_ceil(X_ - IdxXTilde_, XTilde_);
+
+        const auto out_grid_desc = make_out_grid_desc<NDimSpatial>();
+        const auto in_grid_desc  = make_in_grid_desc<NDimSpatial>();
+        const auto wei_grid_desc = make_wei_grid_desc<NDimSpatial>();
+
+        // A: output tensor comes in K_M
+        // Step 1: pad Ho and Wo by zero on both sides; N and K pass through.
+        const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor(
+            out_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_pad_transform(Ho_, I0, I0),
+                       make_pad_transform(Wo_, I0, I0),
+                       make_pass_through_transform(K_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}));
+
+        // Step 2: embed H into (YDot, HTilde) and W into (XDot, WTilde), each with
+        // coefficients (-dilation / gcd(stride, dilation), 1).
+        const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor(
+            out_n_hop_wop_k_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_embed_transform(make_tuple(YDot_, HTilde_),
+                                            make_tuple(-ConvDilationH_ / GcdStrideDilationH_, I1)),
+                       make_embed_transform(make_tuple(XDot_, WTilde_),
+                                            make_tuple(-ConvDilationW_ / GcdStrideDilationW_, I1)),
+                       make_pass_through_transform(K_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}),
+            make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3, 4>{}, sequence<5>{}));
+
+        // Step 3: slice YDot / XDot (from I0) and HTilde / WTilde (from their slice begins).
+        // NOTE(review): confirm whether make_slice_transform's third argument is a length or
+        // an end index.
+        const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc =
+            transform_tensor_descriptor(
+                out_n_ydot_htilde_xdot_wtilde_k_grid_desc,
+                make_tuple(make_pass_through_transform(N_),
+                           make_slice_transform(YDot_, I0, YDotSlice),
+                           make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice),
+                           make_slice_transform(XDot_, I0, XDotSlice),
+                           make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
+                           make_pass_through_transform(K_)),
+                make_tuple(sequence<0>{},
+                           sequence<1>{},
+                           sequence<2>{},
+                           sequence<3>{},
+                           sequence<4>{},
+                           sequence<5>{}),
+                make_tuple(sequence<0>{},
+                           sequence<1>{},
+                           sequence<2>{},
+                           sequence<3>{},
+                           sequence<4>{},
+                           sequence<5>{}));
+
+        // Step 4: merge (YDotSlice, XDotSlice, K) into upper dim 1 and
+        // (N, HTildeSlice, WTildeSlice) into upper dim 0.
+        const auto out_gemmm_gemmkraw_grid_desc = transform_tensor_descriptor(
+            out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc,
+            make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K_)),
+                       make_merge_transform(make_tuple(N_, HTildeSlice, WTildeSlice))),
+            make_tuple(sequence<1, 3, 5>{}, sequence<0, 2, 4>{}),
+            make_tuple(sequence<1>{}, sequence<0>{}));
+
+        // B: weight tensor comes in K_N
+        // Step 1: embed Y into (YDot, YTilde) and X into (XDot, XTilde), each with
+        // coefficients (stride / gcd(stride, dilation), 1).
+        const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor(
+            wei_grid_desc,
+            make_tuple(make_pass_through_transform(K_),
+                       make_embed_transform(make_tuple(YDot_, YTilde_),
+                                            make_tuple(ConvStrideH_ / GcdStrideDilationH_, I1)),
+                       make_embed_transform(make_tuple(XDot_, XTilde_),
+                                            make_tuple(ConvStrideW_ / GcdStrideDilationW_, I1)),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}),
+            make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3, 4>{}, sequence<5>{}));
+
+        // Step 2: slice YDot / XDot and freeze YTilde / XTilde.
+        // The transforms are listed as (K, YDot, XDot, YTilde, XTilde, C), so the lower
+        // dimension ids below are <0>, <1>, <3>, <2>, <4>, <5> rather than in ascending order.
+        // Frozen dimensions have no upper dimension (empty sequence<>).
+        const auto wei_k_ydotslice_xdotslice_c_grid_desc =
+            transform_tensor_descriptor(wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc,
+                                        make_tuple(make_pass_through_transform(K_),
+                                                   make_slice_transform(YDot_, I0, YDotSlice),
+                                                   make_slice_transform(XDot_, I0, XDotSlice),
+                                                   make_freeze_transform(IdxYTilde_),
+                                                   make_freeze_transform(IdxXTilde_),
+                                                   make_pass_through_transform(C_)),
+                                        make_tuple(sequence<0>{},
+                                                   sequence<1>{},
+                                                   sequence<3>{},
+                                                   sequence<2>{},
+                                                   sequence<4>{},
+                                                   sequence<5>{}),
+                                        make_tuple(sequence<0>{},
+                                                   sequence<1>{},
+                                                   sequence<2>{},
+                                                   sequence<>{},
+                                                   sequence<>{},
+                                                   sequence<3>{}));
+
+        // Step 3: merge (YDotSlice, XDotSlice, K) into upper dim 1; C becomes upper dim 0.
+        const auto wei_gemmn_gemmkraw_grid_desc = transform_tensor_descriptor(
+            wei_k_ydotslice_xdotslice_c_grid_desc,
+            make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K_)),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<1, 2, 0>{}, sequence<3>{}),
+            make_tuple(sequence<1>{}, sequence<0>{}));
+
+        // C: input tensor
+        // Step 1: pad Hi and Wi with the left/right input padding.
+        const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
+            in_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_pad_transform(Hi_, InLeftPadH_, InRightPadH_),
+                       make_pad_transform(Wi_, InLeftPadW_, InRightPadW_),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}));
+
+        // Step 2: embed padded H into (YTilde, HTilde) and padded W into (XTilde, WTilde),
+        // each with coefficients (dilation, stride).
+        const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor(
+            in_n_hip_wip_c_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_embed_transform(make_tuple(YTilde_, HTilde_),
+                                            make_tuple(ConvDilationH_, ConvStrideH_)),
+                       make_embed_transform(make_tuple(XTilde_, WTilde_),
+                                            make_tuple(ConvDilationW_, ConvStrideW_)),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}),
+            make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3, 4>{}, sequence<5>{}));
+
+        // Step 3: freeze YTilde / XTilde and slice HTilde / WTilde with the same begin/slice
+        // values used for the output tensor.
+        const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor(
+            in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_freeze_transform(IdxYTilde_),
+                       make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice),
+                       make_freeze_transform(IdxXTilde_),
+                       make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0>{},
+                       sequence<1>{},
+                       sequence<2>{},
+                       sequence<3>{},
+                       sequence<4>{},
+                       sequence<5>{}),
+            make_tuple(sequence<0>{},
+                       sequence<>{},
+                       sequence<1>{},
+                       sequence<>{},
+                       sequence<2>{},
+                       sequence<3>{}));
+
+        // Step 4: merge (N, HTildeSlice, WTildeSlice) into upper dim 0; C becomes upper dim 1.
+        const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor(
+            in_n_htildeslice_wtildeslice_c_grid_desc,
+            make_tuple(make_merge_transform(make_tuple(N_, HTildeSlice, WTildeSlice)),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0, 1, 2>{}, sequence<3>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return make_tuple(out_gemmm_gemmkraw_grid_desc,
+                          wei_gemmn_gemmkraw_grid_desc,
+                          in_gemmmraw_gemmnraw_grid_desc);
+    }
+
+    template <index_t NDim = NDimSpatial, typename std::enable_if<NDim == 3, bool>::type = false>
+    CK_TILE_HOST auto
+    MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N([[maybe_unused]] const index_t GemmKBatch) const
+    {
+        // NDim == 3 overload. Only work on the DTilde/HTilde/WTilde range that contributes to the
+        // non-padding area of the input tensor (bounds use left pad, dilation, stride, length).
+        const auto IDTildeSliceBegin = integer_divide_floor(
+            max(I0, InLeftPadD_ - ConvDilationD_ * (ZTilde_ - I1)), ConvStrideD_);
+        const auto IHTildeSliceBegin = integer_divide_floor(
+            max(I0, InLeftPadH_ - ConvDilationH_ * (YTilde_ - I1)), ConvStrideH_);
+        const auto IWTildeSliceBegin = integer_divide_floor(
+            max(I0, InLeftPadW_ - ConvDilationW_ * (XTilde_ - I1)), ConvStrideW_);
+
+        const auto IDTildeSliceEnd =
+            min(DTilde_, integer_divide_ceil(InLeftPadD_ + Di_ - I1, ConvStrideD_) + I1);
+        const auto IHTildeSliceEnd =
+            min(HTilde_, integer_divide_ceil(InLeftPadH_ + Hi_ - I1, ConvStrideH_) + I1);
+        const auto IWTildeSliceEnd =
+            min(WTilde_, integer_divide_ceil(InLeftPadW_ + Wi_ - I1, ConvStrideW_) + I1);
+
+        const auto DTildeSlice = IDTildeSliceEnd - IDTildeSliceBegin;
+        const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin;
+        const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin;
+
+        // GemmK is different for each GEMM: the dot slices depend on the frozen Idx{Z,Y,X}Tilde_
+        const auto ZDotSlice = integer_divide_ceil(Z_ - IdxZTilde_, ZTilde_);
+        const auto YDotSlice = integer_divide_ceil(Y_ - IdxYTilde_, YTilde_);
+        const auto XDotSlice = integer_divide_ceil(X_ - IdxXTilde_, XTilde_);
+
+        const auto out_grid_desc = make_out_grid_desc<NDimSpatial>();
+        const auto in_grid_desc  = make_in_grid_desc<NDimSpatial>();
+        const auto wei_grid_desc = make_wei_grid_desc<NDimSpatial>();
+
+        // A: output tensor, 5-D [N, Do, Ho, Wo, K] reshaped below into 2-D [GemmM, GemmKRaw]
+        const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor(
+            out_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_pad_transform(Do_, I0, I0),
+                       make_pad_transform(Ho_, I0, I0),
+                       make_pad_transform(Wo_, I0, I0),
+                       make_pass_through_transform(K_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}, sequence<4>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}, sequence<4>{}));
+
+        const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor(
+            out_n_hop_wop_k_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_embed_transform(make_tuple(ZDot_, DTilde_),
+                                            make_tuple(-ConvDilationD_ / GcdStrideDilationD_, I1)),
+                       make_embed_transform(make_tuple(YDot_, HTilde_),
+                                            make_tuple(-ConvDilationH_ / GcdStrideDilationH_, I1)),
+                       make_embed_transform(make_tuple(XDot_, WTilde_),
+                                            make_tuple(-ConvDilationW_ / GcdStrideDilationW_, I1)),
+                       make_pass_through_transform(K_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}, sequence<4>{}),
+            make_tuple(sequence<0>{},
+                       sequence<1, 2>{},
+                       sequence<3, 4>{},
+                       sequence<5, 6>{},
+                       sequence<7>{}));
+
+        const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc =
+            transform_tensor_descriptor(
+                out_n_ydot_htilde_xdot_wtilde_k_grid_desc,
+                make_tuple(make_pass_through_transform(N_),
+                           make_slice_transform(ZDot_, I0, ZDotSlice),
+                           make_slice_transform(DTilde_, IDTildeSliceBegin, DTildeSlice),
+                           make_slice_transform(YDot_, I0, YDotSlice),
+                           make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice),
+                           make_slice_transform(XDot_, I0, XDotSlice),
+                           make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
+                           make_pass_through_transform(K_)),
+                make_tuple(sequence<0>{},
+                           sequence<1>{},
+                           sequence<2>{},
+                           sequence<3>{},
+                           sequence<4>{},
+                           sequence<5>{},
+                           sequence<6>{},
+                           sequence<7>{}),
+                make_tuple(sequence<0>{},
+                           sequence<1>{},
+                           sequence<2>{},
+                           sequence<3>{},
+                           sequence<4>{},
+                           sequence<5>{},
+                           sequence<6>{},
+                           sequence<7>{}));
+
+        const auto out_gemmm_gemmkraw_grid_desc = transform_tensor_descriptor(
+            out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc,
+            make_tuple(make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K_)),
+                       make_merge_transform(make_tuple(N_, DTildeSlice, HTildeSlice, WTildeSlice))),
+            make_tuple(sequence<1, 3, 5, 7>{}, sequence<0, 2, 4, 6>{}),
+            make_tuple(sequence<1>{}, sequence<0>{}));
+
+        // B: weight tensor, 5-D [K, Z, Y, X, C] reshaped below into 2-D [GemmN, GemmKRaw]
+        const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor(
+            wei_grid_desc,
+            make_tuple(make_pass_through_transform(K_),
+                       make_embed_transform(make_tuple(ZDot_, ZTilde_),
+                                            make_tuple(ConvStrideD_ / GcdStrideDilationD_, I1)),
+                       make_embed_transform(make_tuple(YDot_, YTilde_),
+                                            make_tuple(ConvStrideH_ / GcdStrideDilationH_, I1)),
+                       make_embed_transform(make_tuple(XDot_, XTilde_),
+                                            make_tuple(ConvStrideW_ / GcdStrideDilationW_, I1)),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}, sequence<4>{}),
+            make_tuple(sequence<0>{},
+                       sequence<1, 2>{},
+                       sequence<3, 4>{},
+                       sequence<5, 6>{},
+                       sequence<7>{}));
+
+        const auto wei_k_ydotslice_xdotslice_c_grid_desc =
+            transform_tensor_descriptor(wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc,
+                                        make_tuple(make_pass_through_transform(K_),
+                                                   make_slice_transform(ZDot_, I0, ZDotSlice),
+                                                   make_slice_transform(YDot_, I0, YDotSlice),
+                                                   make_slice_transform(XDot_, I0, XDotSlice),
+                                                   make_freeze_transform(IdxZTilde_),
+                                                   make_freeze_transform(IdxYTilde_),
+                                                   make_freeze_transform(IdxXTilde_),
+                                                   make_pass_through_transform(C_)),
+                                        make_tuple(sequence<0>{},
+                                                   sequence<1>{},
+                                                   sequence<3>{},
+                                                   sequence<5>{},
+                                                   sequence<2>{},
+                                                   sequence<4>{},
+                                                   sequence<6>{},
+                                                   sequence<7>{}),
+                                        make_tuple(sequence<0>{},
+                                                   sequence<1>{},
+                                                   sequence<2>{},
+                                                   sequence<3>{},
+                                                   sequence<>{},
+                                                   sequence<>{},
+                                                   sequence<>{},
+                                                   sequence<4>{}));
+
+        const auto wei_gemmn_gemmkraw_grid_desc = transform_tensor_descriptor(
+            wei_k_ydotslice_xdotslice_c_grid_desc,
+            make_tuple(make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K_)),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<1, 2, 3, 0>{}, sequence<4>{}),
+            make_tuple(sequence<1>{}, sequence<0>{}));
+
+        // C: input tensor, 5-D [N, Di, Hi, Wi, C] reshaped below into 2-D [GemmMRaw, GemmNRaw]
+        const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
+            in_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_pad_transform(Di_, InLeftPadD_, InRightPadD_),
+                       make_pad_transform(Hi_, InLeftPadH_, InRightPadH_),
+                       make_pad_transform(Wi_, InLeftPadW_, InRightPadW_),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}, sequence<4>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}, sequence<4>{}));
+
+        const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor(
+            in_n_hip_wip_c_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_embed_transform(make_tuple(ZTilde_, DTilde_),
+                                            make_tuple(ConvDilationD_, ConvStrideD_)),
+                       make_embed_transform(make_tuple(YTilde_, HTilde_),
+                                            make_tuple(ConvDilationH_, ConvStrideH_)),
+                       make_embed_transform(make_tuple(XTilde_, WTilde_),
+                                            make_tuple(ConvDilationW_, ConvStrideW_)),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}, sequence<4>{}),
+            make_tuple(sequence<0>{},
+                       sequence<1, 2>{},
+                       sequence<3, 4>{},
+                       sequence<5, 6>{},
+                       sequence<7>{}));
+
+        const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor(
+            in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc,
+            make_tuple(make_pass_through_transform(N_),
+                       make_freeze_transform(IdxZTilde_),
+                       make_slice_transform(DTilde_, IDTildeSliceBegin, DTildeSlice),
+                       make_freeze_transform(IdxYTilde_),
+                       make_slice_transform(HTilde_, IHTildeSliceBegin, HTildeSlice),
+                       make_freeze_transform(IdxXTilde_),
+                       make_slice_transform(WTilde_, IWTildeSliceBegin, WTildeSlice),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0>{},
+                       sequence<1>{},
+                       sequence<2>{},
+                       sequence<3>{},
+                       sequence<4>{},
+                       sequence<5>{},
+                       sequence<6>{},
+                       sequence<7>{}),
+            make_tuple(sequence<0>{},
+                       sequence<>{},
+                       sequence<1>{},
+                       sequence<>{},
+                       sequence<2>{},
+                       sequence<>{},
+                       sequence<3>{},
+                       sequence<4>{}));
+
+        const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor(
+            in_n_htildeslice_wtildeslice_c_grid_desc,
+            make_tuple(make_merge_transform(make_tuple(N_, DTildeSlice, HTildeSlice, WTildeSlice)),
+                       make_pass_through_transform(C_)),
+            make_tuple(sequence<0, 1, 2, 3>{}, sequence<4>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return make_tuple(out_gemmm_gemmkraw_grid_desc,
+                          wei_gemmn_gemmkraw_grid_desc,
+                          in_gemmmraw_gemmnraw_grid_desc);
+    }
+
+    IndexType G_, N_; // N_: dim 0 of in/out descriptors; G_: presumably group count (confirm)
+    IndexType Di_, Hi_, Wi_; // input lengths along D/H/W
+    IndexType Do_, Ho_, Wo_; // output lengths along D/H/W
+    IndexType Z_, Y_, X_; // weight (filter) lengths along D/H/W
+    IndexType K_, C_; // K_: last output dim / first weight dim; C_: last weight and input dim
+    IndexType ConvStrideD_, ConvStrideH_, ConvStrideW_; // convolution stride per spatial dim
+    IndexType ConvDilationD_, ConvDilationH_, ConvDilationW_; // convolution dilation per dim
+    IndexType InLeftPadD_, InLeftPadH_, InLeftPadW_; // left padding applied to the input
+    IndexType InRightPadD_, InRightPadH_, InRightPadW_; // right padding applied to the input
+    IndexType IdxZTilde_, IdxYTilde_, IdxXTilde_; // tilde indices frozen for this GEMM
+    IndexType GcdStrideDilationD_, GcdStrideDilationH_, GcdStrideDilationW_; // presumably gcd(stride, dilation) - confirm
+    IndexType ZTilde_, YTilde_, XTilde_; // tilde lengths used by the weight/input embeds
+    IndexType DTilde_, HTilde_, WTilde_; // tilde lengths used by the output/input embeds
+    IndexType ZDot_, YDot_, XDot_; // dot lengths used by the output/weight embeds
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp b/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
index ee74f1588f..eb54807d88 100644
--- a/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
+++ b/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -31,6 +31,7 @@ struct ImageToColumn
 
     static constexpr index_t kMPerBlock = Problem::BlockShape::kMPerBlock;
     static constexpr index_t kKPerBlock = Problem::BlockShape::kKPerBlock;
+    static constexpr index_t kBlockSize = Problem::BlockShape::kBlockSize;
 
     struct Kargs
     {
diff --git a/include/ck_tile/ops/image_to_column/pipeline/tile_image_to_column_shape.hpp b/include/ck_tile/ops/image_to_column/pipeline/tile_image_to_column_shape.hpp
index ad513dbd11..05490ac3ed 100644
--- a/include/ck_tile/ops/image_to_column/pipeline/tile_image_to_column_shape.hpp
+++ b/include/ck_tile/ops/image_to_column/pipeline/tile_image_to_column_shape.hpp
@@ -14,11 +14,10 @@ struct TileImageToColumnShape
     static constexpr index_t kMPerThread = ThreadTile::at(number<0>{});
     static constexpr index_t kKPerThread = ThreadTile::at(number<1>{});
 
-    static constexpr index_t kMPerWarp = WarpTile::at(number<0>{});
-    static constexpr index_t kKPerWarp = WarpTile::at(number<1>{});
-
+    static constexpr index_t kMPerWarp       = WarpTile::at(number<0>{});
     static constexpr index_t kMThreadPerWarp = kMPerWarp / kMPerThread;
-    static constexpr index_t kKThreadPerWarp = kKPerWarp / kKPerThread;
+    static constexpr index_t kKThreadPerWarp = get_warp_size() / kMThreadPerWarp;
+    static constexpr index_t kKPerWarp       = kKPerThread * kKThreadPerWarp;
 
     static constexpr index_t kMPerBlock = BlockTile::at(number<0>{});
     static constexpr index_t kKPerBlock = BlockTile::at(number<1>{});
diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
index 146ac40fb7..6998b358d8 100644
--- a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
+++ b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
@@ -76,9 +76,9 @@ struct Layernorm2dFwd
     static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
     static constexpr index_t Vector_N        = Problem::BlockShape::Vector_N;
     static constexpr index_t Repeat_N        = Problem::BlockShape::Repeat_N;
-
-    static constexpr auto I0 = number<0>{};
-    static constexpr auto I1 = number<1>{};
+    static constexpr index_t kBlockSize      = Problem::BlockShape::BlockSize;
+    static constexpr auto I0                 = number<0>{};
+    static constexpr auto I1                 = number<1>{};
 
     struct Kargs
     {
diff --git a/include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp b/include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp
index 1c5cc4a11a..3578e3b375 100644
--- a/include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp
+++ b/include/ck_tile/ops/permute/kernel/generic_permute_kernel.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck_tile/ops/reduce.hpp b/include/ck_tile/ops/reduce.hpp
index 042e0b98c2..a6721c9305 100644
--- a/include/ck_tile/ops/reduce.hpp
+++ b/include/ck_tile/ops/reduce.hpp
@@ -6,10 +6,10 @@
 #include "ck_tile/ops/reduce/block/block_reduce.hpp"
 #include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
 #include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
-#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
-#include "ck_tile/ops/common/tensor_layout.hpp"
-#include "ck_tile/ops/common/utils.hpp"
 #include "ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp"
 #include "ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp"
 #include "ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp"
 #include "ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp
index 434be9f84a..7a10d1fa56 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp
@@ -14,10 +14,14 @@ namespace ck_tile {
  * Y dim must have at least one dim not been reduced
  */
 // synchronize reduce result (cross lane reduction and broadcast on replicated dimension)
-template <typename AccDistributedTensor_, typename ReduceFunc, bool WithBroadcast = true>
+template <typename AccDistributedTensor_,
+          typename ReduceFunc,
+          bool WithBroadcast = true,
+          bool CrossWarp     = true>
 CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
                                            const ReduceFunc& reduce_func,
-                                           bool_constant<WithBroadcast> = {})
+                                           bool_constant<WithBroadcast> = {},
+                                           bool_constant<CrossWarp>     = {})
 {
     using Dstr             = typename AccDistributedTensor_::StaticTileDistribution;
     using DstrEncode       = typename Dstr::DstrEncode;
@@ -56,14 +60,24 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
 
                 // reduction sweep forward
                 static_for<0, nstage, 1>{}([&](auto istage) {
-                    constexpr index_t lid_delta =
-                        lid_over_rid_derivative * (1 << (nstage - istage - 1));
+                    if constexpr(CrossWarp)
+                    {
+                        constexpr index_t lid_delta =
+                            lid_over_rid_derivative * (1 << (nstage - istage - 1));
 
-                    // pull data from remote lane
-                    const auto v_remote = warp_shuffle_down(v_local, lid_delta);
+                        // pull data from remote lane
+                        const auto v_remote = warp_shuffle_down(v_local, lid_delta);
 
-                    // reduce
-                    v_local = reduce_func(v_local, v_remote);
+                        // reduce
+                        v_local = reduce_func(v_local, v_remote);
+                    }
+                    else
+                    {
+                        // pull data from remote lane
+                        const auto v_swapped_regs = warp_shuffle_down_pair(v_local);
+                        // reduce
+                        v_local = reduce_func(v_swapped_regs.at(0), v_swapped_regs.at(1));
+                    }
                 });
             }
         });
diff --git a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
index 0cae4023b7..5755f38475 100644
--- a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
+++ b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
@@ -25,6 +25,8 @@ struct Reduce
     using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
     using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
 
+    static constexpr index_t kBlockSize = Problem::BlockShape::BlockSize;
+
     private:
     // Helper function to calculate optimal vector size for input tensor
     template <typename InputShape, typename ReduceDims>
diff --git a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
index 6cb81b8856..e7f4ce0ba8 100644
--- a/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp
@@ -70,6 +70,7 @@ struct Rmsnorm2dFwd
     static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
     static constexpr index_t Vector_N        = Problem::BlockShape::Vector_N;
     static constexpr index_t Repeat_N        = Problem::BlockShape::Repeat_N;
+    static constexpr index_t kBlockSize      = Problem::BlockShape::BlockSize;
 
     static constexpr auto I0 = number<0>{};
     static constexpr auto I1 = number<1>{};
diff --git a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
index cb934c6c52..b70e996617 100644
--- a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
+++ b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
@@ -48,6 +48,7 @@ struct MoeSmoothquant
     static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
     static constexpr index_t Vector_N        = Problem::BlockShape::Vector_N;
     static constexpr index_t Repeat_N        = Problem::BlockShape::Repeat_N;
+    static constexpr index_t kBlockSize      = Problem::BlockShape::BlockSize;
 
     static constexpr auto I0 = number<0>{};
     static constexpr auto I1 = number<1>{};
diff --git a/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
index 540fddd2e8..7dc913901e 100644
--- a/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
+++ b/include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
@@ -45,6 +45,7 @@ struct Smoothquant
     static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
     static constexpr index_t Vector_N        = Problem::BlockShape::Vector_N;
     static constexpr index_t Repeat_N        = Problem::BlockShape::Repeat_N;
+    static constexpr index_t kBlockSize      = Problem::BlockShape::BlockSize;
 
     static constexpr auto I0 = number<0>{};
     static constexpr auto I1 = number<1>{};
diff --git a/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
index b8520ae61a..277049f6b0 100644
--- a/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
+++ b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -34,6 +34,8 @@ struct TopkSoftmaxKernel
     using WeightType = typename Problem::WeightType;
     using IndexType  = typename Problem::IndexType;
 
+    static constexpr index_t kBlockSize = Problem::BlockSize;
+
     struct TopkSoftmaxKargs
     {
         const void* p_input;
diff --git a/include/ck_tile/ref/naive_attention.hpp b/include/ck_tile/ref/naive_attention.hpp
index 172fcee2e3..50e963bd72 100644
--- a/include/ck_tile/ref/naive_attention.hpp
+++ b/include/ck_tile/ref/naive_attention.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -117,7 +117,7 @@ struct naive_attention_fwd_kernel
         std::is_same_v<KType, fp8_t> && std::is_same_v<VType, fp8_t>;
 
     static constexpr int v_per_token_quant_group_size = 64;
-
+    static constexpr int kBlockSize                   = 256;
     // TODO: hardcode
     using SoftmaxType      = float; // always using float to do softmax compute
     using QuantComputeType = float; // used for quant/dequant scale compute
@@ -254,7 +254,7 @@ struct naive_attention_fwd_kernel
         __device__ T load(int i_s, int i_h, int i_d) { return base_ptr[get_offset(i_s, i_h, i_d)]; }
     };
 
-    __device__ __host__ static constexpr int get_block_size() { return 256; }
+    __device__ __host__ static constexpr int get_block_size() { return kBlockSize; }
 
     // for simpliciy, 1 WG always compute 1 token along q, compute all token along kv
     // compute all hdim from q, compute WG_SIZE hdim from v
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_ab_scale.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_ab_scale.hpp
index 3fa82ae53a..e78ef7b803 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_ab_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_ab_scale.hpp
@@ -17,6 +17,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 #if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_FP8))
+// ALayout = Row, BLayout = Col (mk_nk_mn)
 void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_comp_default_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
                                                             Col,
@@ -88,6 +89,152 @@ void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_kpadding_
                                                             PassThrough,
                                                             PassThrough,
                                                             PassThrough>>>& instances);
+
+// ALayout = Row, BLayout = Row (mk_kn_mn)
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1,
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances);
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1,
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances);
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1,
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances);
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1,
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances);
+
+// ALayout = Col, BLayout = Row (km_kn_mn)
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Col,
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1,
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances);
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Col,
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1,
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances);
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Col,
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1,
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances);
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Col,
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1,
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances);
 #endif
 
 template <typename A0DataType,
@@ -154,6 +301,32 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                 add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_1_128_128_mem_v1_kpadding_instances(
                     op_ptrs);
             }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_comp_kpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_mem_v1_kpadding_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_comp_kpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_mem_v1_kpadding_instances(
+                    op_ptrs);
+            }
         }
 #endif
         return op_ptrs;
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/CMakeLists.txt
index d572862884..4f3c2f1ff5 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/CMakeLists.txt
@@ -2,15 +2,37 @@
 set(GEMM_AB_SCALE_INSTANCES)
 
 list(APPEND GEMM_AB_SCALE_INSTANCES 
+        # Row, Col
         device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp
         device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp
         device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp
         device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp
+        # Row, Row
+        device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_comp_default_instance.cpp
+        device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_comp_kpadding_instance.cpp
+        device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_mem_v1_default_instance.cpp
+        device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_mem_v1_kpadding_instance.cpp
+        # Col, Row
+        device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_comp_default_instance.cpp
+        device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_comp_kpadding_instance.cpp
+        device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_mem_v1_default_instance.cpp
+        device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_mem_v1_kpadding_instance.cpp
         )
 
+# A row-major, B column-major (mk_nk): per-file LLVM option for each instance source
 set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+# A row-major, B row-major (mk_kn): per-file LLVM option for each instance source
+set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+# A column-major, B row-major (km_kn): per-file LLVM option for each instance source
+set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 
 add_instance_library(device_gemm_ab_scale_instance ${GEMM_AB_SCALE_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128.hpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128.hpp
new file mode 100644
index 0000000000..353e3db0f9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128.hpp
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once // included by several *_instance.cpp files; guards the static constexpr definitions below against repeated inclusion
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8   = f8_t;
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>; // shorthand for the index sequences (thread-cluster lengths, access orders) in the instance tables
+
+using PassThrough = element_wise::PassThrough; // element-wise op used for A, B and C in every instance of this header
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;  // used by the *_default_instance.cpp files
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding; // used by the *_kpadding_instance.cpp files
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;  // not referenced inside this header
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; // not referenced inside this header
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; // NOTE(review): presumably the BlkGemmPipeSched argument supplied by the mem_v1 .cpp files - confirm
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_comp_instances =
+    std::tuple<
+        // clang-format off
+        //################################| ALayout| BLayout|       DsLayout| ELayout|      AData|      BData|     DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block| Scale| Scale| Scale|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //################################|        |        |               |        |       Type|       Type|       Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //################################|        |        |               |        |           |           |           |      |        |         |   Operation|   Operation|      Operation|              |      |     M|     N|     K|      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                      Version|
+        //################################|        |        |               |        |           |           |           |      |        |         |            |            |               |              |      |      |      |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        // Compute friendly
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,   128,   128,   128,   4,   4,  32,   32,    2,    2,     S< 8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,   S< 8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,   128,   128,   128,   4,   4,  16,   16,    4,    4,     S< 8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,   S< 8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,   128,   128,   128,   4,   4,  32,   32,    2,    2,     S<32,  8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,   S<32,  8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,   128,   128,   128,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,   S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,   128,    64,   128,   4,   4,  32,   32,    2,    1,     S< 8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              2,              4,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,    64,   128,   128,   4,   4,  32,   32,    1,    2,     S< 8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              2,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,    64,    64,   128,   4,   4,  32,   32,    1,    1,     S< 8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              2,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              2,              4,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_mem_instances = std::tuple<
+    // clang-format off
+        //################################| ALayout| BLayout|      DsLayout| ELayout|AData    |     BData|      DsData| EData| AccData| Cshuffle|           A|           B|               C|          GEMM| Block|  Scale| Scale| Scale|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //################################|        |        |              |        | Type    |      Type|        Type|  Type|    Type|     Type| Elementwise| Elementwise|     Elementwise|Specialization|  Size|  Block| Block| Block| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //################################|        |        |              |        |         |          |            |      |        |         |   Operation|   Operation|       Operation|              |      |      M|     N|     K|      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                      Version|
+        //################################|        |        |              |        |         |          |            |      |        |         |            |            |                |              |      |       |      |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
+        // Memory friendly 
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,   256,   128,  4,   4,  16,   16,    1,    4,     S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              1,              1,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              4,              4,          0,          1,           2,                 S<1, 16, 1, 16>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,   128,   128,  4,   4,  16,   16,    1,    2,     S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              1,              1,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              4,              4,          0,          1,           2,                 S<1, 16, 1, 16>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,    64,   128,  4,   4,  16,   16,    1,    1,     S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              1,              1,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              2,              2,          0,          1,           1,                 S<1, 16, 1, 16>,            S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+
+        // FIXME: KPerBlock=256 gives numerically bad results; the two instances below stay disabled until that is fixed:
+        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,   128,   256,  4,   4,  16,   16,    1,    2,     S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              1,              1,          0,    S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,             1,              4,              4,          0,          1,           2,                 S<1, 16, 1, 16>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,    64,   256,  4,   4,  16,   16,    1,    1,     S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              1,              1,          0,    S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                 S<1, 16, 1, 16>,            S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,   256,   128,  4,   4,  32,   32,    1,    2,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              1,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              8,              4,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,   128,   128,  4,   4,  32,   32,    1,    1,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              1,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              4,              4,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,    64,   128,  4,   4,  16,   16,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              1,              1,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              2,              2,          0,          2,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,   256,   128,  4,   4,  32,   32,    1,    2,     S<32, 8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,    S<32, 8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,             16,              4,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,   128,   128,  4,   4,  32,   32,    1,    1,     S<32, 8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,    S<32, 8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,             16,              4,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,    64,   128,  4,   4,  16,   16,    2,    1,     S<32, 8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,    S<32, 8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              8,              2,          0,          2,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+
+        // FIXME: KPerBlock=256 gives numerically bad results; the two instances below stay disabled until that is fixed:
+        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,   128,   256,  4,   4,  32,   32,    1,    1,     S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,             16,             16,          0,    S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,             1,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,    64,   256,  4,   4,  16,   16,    2,    1,     S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,             16,             16,          0,    S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,             1,             16,             16,          0,          2,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,   256,   128,  4,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              2,              2,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              8,              4,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,   128,   128,  4,   4,  32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              2,              2,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              4,              4,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,    64,   128,  4,   4,  32,   32,    1,    1,     S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              2,              2,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              2,              2,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>
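+        // FIXME(review): the two KPerBlock=256 instances below are presumably disabled for the same bad-numerics reason - confirm: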
+        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,   128,   256,  4,   4,  32,   32,    2,    1,     S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,             16,             16,          0,    S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,             1,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Col,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,    64,   256,  4,   4,  32,   32,    1,    1,     S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,             16,             16,          0,    S<16, 16, 1>,     S<0, 2, 1>,    S<0, 2, 1>,             1,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_comp_default_instance.cpp
new file mode 100644
index 0000000000..b1d5443c49
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_comp_default_instance.cpp
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_comp_default_instances( // NOTE(review): argument roles below are inferred from names and the instance-table headers - confirm against DeviceGemmMultipleD_ABScale
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Col, // A layout ("km")
+                                                            Row, // B layout ("kn")
+                                                            Tuple<>, // Ds layout: empty tuple
+                                                            Row, // output layout ("mn")
+                                                            F8, // A data type
+                                                            F32, // presumably the A scale type
+                                                            F8, // B data type
+                                                            F32, // presumably the B scale type
+                                                            Tuple<>, // Ds data types: empty tuple
+                                                            BF16, // output data type
+                                                            1, // presumably scale block M (the "1" in "1_128_128")
+                                                            128, // presumably scale block N
+                                                            128, // presumably scale block K
+                                                            PassThrough, // presumably the A element-wise op
+                                                            PassThrough, // presumably the B element-wise op
+                                                            PassThrough>>>& instances) // presumably the C element-wise op; `instances` is taken by non-const reference
+{ // forwards `instances` together with the comp ("compute friendly") instance tuple, specialized with GemmDefault
+    add_device_operation_instances(
+        instances,
+        device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_comp_instances<GemmDefault>{}); // default-constructed tuple; only its type list matters here, presumably
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_comp_kpadding_instance.cpp
new file mode 100644
index 0000000000..4d72edf910
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_comp_kpadding_instance.cpp
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Col, // first four type args look like A/B/Ds/E layout tags — TODO confirm
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1, // 1, 128, 128 match the "1_128_128" in the function name; presumably scale-block sizes — TODO confirm
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances)
+{ // Same comp instance list as the "default" variant, but instantiated with GemmKPadding before being forwarded.
+    add_device_operation_instances(
+        instances,
+        device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_comp_instances<GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_mem_v1_default_instance.cpp
new file mode 100644
index 0000000000..fbb35d6bec
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_mem_v1_default_instance.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Col, // first four type args look like A/B/Ds/E layout tags — TODO confirm
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1, // 1, 128, 128 match the "1_128_128" in the function name; presumably scale-block sizes — TODO confirm
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances)
+{ // Forwards `instances` and a default-constructed mem instance list (Intrawave scheduler, GemmDefault) to add_device_operation_instances.
+    add_device_operation_instances(
+        instances,
+        device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_mem_instances<Intrawave,
+                                                                             GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_mem_v1_kpadding_instance.cpp
new file mode 100644
index 0000000000..b90c48c7af
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128_mem_v1_kpadding_instance.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_128_128_128.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Col, // first four type args look like A/B/Ds/E layout tags — TODO confirm
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1, // 1, 128, 128 match the "1_128_128" in the function name; presumably scale-block sizes — TODO confirm
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances)
+{ // Same mem instance list as the "default" variant (Intrawave scheduler), but instantiated with GemmKPadding.
+    add_device_operation_instances(
+        instances,
+        device_gemm_ab_scale_xdl_f8_f8_bf16_km_kn_mn_1_128_128_mem_instances<Intrawave,
+                                                                             GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128.hpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128.hpp
new file mode 100644
index 0000000000..9d846354bf
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128.hpp
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8   = f8_t;    // element type passed for the A and B operands in the instance tables below
+using BF16 = bhalf_t; // element type passed for the E (output) operand in the instance tables below
+using F32  = float;   // used for the scale, accumulator and C-shuffle type arguments below
+// Matrix layout tags.
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>; // shorthand for the compile-time integer lists in the instance tables
+
+// PassThrough is supplied for all three element-wise operation arguments (A, B and C) below.
+using PassThrough = element_wise::PassThrough;
+// GEMM specializations the instance lists can be instantiated with.
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+// Block-GEMM pipeline scheduler value passed to the mem instance list by the registration functions.
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_comp_instances =
+    std::tuple< // one device-op type per tuning configuration; GemmSpec is forwarded to every entry
+        // clang-format off
+        //################################| ALayout| BLayout|       DsLayout| ELayout|      AData|      BData|     DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block| Scale| Scale| Scale|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //################################|        |        |               |        |       Type|       Type|       Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //################################|        |        |               |        |           |           |           |      |        |         |   Operation|   Operation|      Operation|              |      |     M|     N|     K|      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                      Version|
+        //################################|        |        |               |        |           |           |           |      |        |         |            |            |               |              |      |      |      |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
+        // Compute friendly: all four entries use BlockGemmPipelineVersion::v3 with the Intrawave scheduler; they differ mainly in MPerBlock/NPerBlock (128 or 64).
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,             1,             16,             4,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,   128,    64,   128,  16,   4,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,             1,              8,             4,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,    64,   128,   128,  16,   4,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,             1,              8,             4,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>,    Row,    F8, F32,    F8, F32,    Tuple<>,   BF16,     F32,     F32,   PassThrough, PassThrough, PassThrough,    GemmSpec,       256,     1,   128,   128,    64,    64,   128,  16,   4,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,     S<0, 2, 1>,    S<0, 2, 1>,             1,              8,             4,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+        // clang-format on
+        >;
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_mem_instances = std::tuple< // one device-op type per tuning configuration
+    // clang-format off
+        //################################| ALayout| BLayout|      DsLayout| ELayout|AData    |     BData|      DsData| EData| AccData| Cshuffle|           A|           B|               C|          GEMM| Block|  Scale| Scale| Scale|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //################################|        |        |              |        | Type    |      Type|        Type|  Type|    Type|     Type| Elementwise| Elementwise|     Elementwise|Specialization|  Size|  Block| Block| Block| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //################################|        |        |              |        |         |          |            |      |        |         |   Operation|   Operation|       Operation|              |      |      M|     N|     K|      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                      Version|
+        //################################|        |        |              |        |         |          |            |      |        |         |            |            |                |              |      |       |      |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
+        // Memory friendly: every active entry uses BlockGemmPipelineVersion::v1 and takes its scheduler from BlkGemmPipeSched; MPerBlock is 16, 32 or 64.
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,   256,   128,  8,   4,  16,   16,    1,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           2,                 S<1, 16, 1, 16>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,   128,   128,  8,   4,  16,   16,    1,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              4,              4,          0,          1,           2,                 S<1, 16, 1, 16>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,    64,   128,  8,   4,  16,   16,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,               1,              2,              2,          0,          1,           1,                 S<1, 16, 1, 16>,            S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        // Disabled (commented out): KPerBlock = 256 variants of the MPerBlock = 16 rows above.
+        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,   128,   256, 16,  16,  16,   16,    1,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,             16,             16,          0,          1,           2,                 S<1, 16, 1, 16>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    16,    64,   256, 16,  16,  16,   16,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,             16,             16,          0,          1,           1,                 S<1, 16, 1, 16>,            S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        // NOTE(review): the three MPerBlock = 32 rows below pass S<1, 0, 2> as B SrcAccessOrder while every other active row passes S<0, 2, 1> — confirm this is intentional.
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,   256,   128, 16,   4,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<1, 0, 2>,              1,              8,              4,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,   128,   128, 16,   4,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<1, 0, 2>,              1,              4,              4,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,    64,   128, 16,   4,  16,   16,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<1, 0, 2>,              1,              2,              2,          0,          2,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+
+        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,   128,   256, 16,  16,  32,   32,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    32,    64,   256, 16,  16,  16,   16,    2,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,             16,             16,          0,          2,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,   256,   128, 16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              8,              4,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,   128,   128, 16,   4,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              4,              4,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,    64,   128, 16,   4,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<0, 2, 1>,    S<0, 2, 1>,              1,              2,              2,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>
+
+        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,   128,   256, 16,  16,  32,   32,    2,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        // DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3<  Row,     Row,     Tuple<>, Row,     F8,F32,     F8,F32,    Tuple<>, BF16,   F32,     F32,     PassThrough, PassThrough,      PassThrough,     GemmSpec,    256,      1,   128,   128,    64,    64,   256, 16,  16,  32,   32,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,             2,             16,             16,          0,          1,           1,                 S<1, 32, 1, 8>,            S<8>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_comp_default_instance.cpp
new file mode 100644
index 0000000000..b249fd82d8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_comp_default_instance.cpp
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row, // first four type args look like A/B/Ds/E layout tags — TODO confirm
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1, // 1, 128, 128 match the "1_128_128" in the function name; presumably scale-block sizes — TODO confirm
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances)
+{ // Forwards `instances` and a default-constructed comp instance tuple (GemmDefault specialization) to add_device_operation_instances.
+    add_device_operation_instances(
+        instances,
+        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_comp_kpadding_instance.cpp
new file mode 100644
index 0000000000..772a4e730b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_comp_kpadding_instance.cpp
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row, // first four type args look like A/B/Ds/E layout tags — TODO confirm
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1, // 1, 128, 128 match the "1_128_128" in the function name; presumably scale-block sizes — TODO confirm
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances)
+{ // Same comp instance tuple as the "default" variant, but instantiated with GemmKPadding before being forwarded.
+    add_device_operation_instances(
+        instances,
+        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_comp_instances<GemmKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_mem_v1_default_instance.cpp
new file mode 100644
index 0000000000..8ffb38b115
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_mem_v1_default_instance.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_mem_v1_default_instances( // Forwards the Intrawave + GemmDefault variant of the mem instance list to add_device_operation_instances.
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1, // 1, 128, 128 mirror the _1_128_128_ part of the function name; their meaning is not visible here - TODO confirm
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances) // caller-owned vector, passed on by reference to the helper below
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_mem_instances<Intrawave, // first template argument: Intrawave (presumably the pipeline scheduler - confirm in the included header)
+                                                                             GemmDefault>{}); // second template argument: GemmDefault; the temporary is handed to the helper together with instances
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_mem_v1_kpadding_instance.cpp
new file mode 100644
index 0000000000..edccd05931
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_ab_scale/device_gemm_ab_scale_xdl_f8_f8_bf16/device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128_mem_v1_kpadding_instance.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_128_128_128.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_mem_v1_kpadding_instances( // Forwards the Intrawave + GemmKPadding variant of the mem instance list to add_device_operation_instances.
+    std::vector<std::unique_ptr<DeviceGemmMultipleD_ABScale<Row,
+                                                            Row,
+                                                            Tuple<>,
+                                                            Row,
+                                                            F8,
+                                                            F32,
+                                                            F8,
+                                                            F32,
+                                                            Tuple<>,
+                                                            BF16,
+                                                            1, // 1, 128, 128 mirror the _1_128_128_ part of the function name; their meaning is not visible here - TODO confirm
+                                                            128,
+                                                            128,
+                                                            PassThrough,
+                                                            PassThrough,
+                                                            PassThrough>>>& instances) // caller-owned vector, passed on by reference to the helper below
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_ab_scale_xdl_f8_f8_bf16_mk_kn_mn_1_128_128_mem_instances<Intrawave, // first template argument: Intrawave (presumably the pipeline scheduler - confirm in the included header)
+                                                                             GemmKPadding>{}); // second template argument: GemmKPadding; the temporary is handed to the helper together with instances
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
index 27d7933477..c5ebd7d2f5 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
@@ -74,6 +74,54 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances_part2 = std::
 #endif
     // clang-format on
     >;
+// Compute-friendly instance list for double-rate MFMA on gfx950; the tuple is empty unless the FP8 guard below holds. NOTE(review): the guard tests gfx94 macros, not gfx950 - confirm this is intended.
+template <GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances_dr = std::tuple<
+// clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                      Version|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        // Compute friendly
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,   128,  32,  32,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,   128,  32,  32,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,   128,  32,  32,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  32,  32,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  32,  32,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  32,  32,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    64,  32,  32,  32,   32,    2,    4,     S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    64,  32,  32,  32,   32,    4,    2,     S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  32,  32,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,    64,   128,  32,  32,  32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,    64,   256,  32,  32,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,    64,   256,  32,  32,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,    64,   256,  32,  32,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  32,  32,  32,   32,    4,    4,     S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  32,  32,  32,   32,    4,    4,     S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  32,  32,  32,   32,    4,    4,     S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  32,  32,  32,   32,    4,    4,     S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    64,  32,  32,  32,   32,    2,    4,     S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    64,  32,  32,  32,   32,    2,    4,     S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    64,  32,  32,  32,   32,    2,    4,     S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    64,  32,  32,  32,   32,    4,    2,     S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    64,  32,  32,  32,   32,    4,    2,     S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    64,  32,  32,  32,   32,    4,    2,     S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<2, 128, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   256,   128,  32,  32,  32,   32,    1,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   256,   128,  32,  32,  32,   32,    1,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   256,   128,  32,  32,  32,   32,    1,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,    64,   128,  32,  32,  32,   32,    4,    1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,    64,   128,  32,  32,  32,   32,    4,    1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,    64,   128,  32,  32,  32,   32,    4,    1,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,    64,   256,  32,  32,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,    64,   256,  32,  32,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,    64,   256,  32,  32,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,    64,   512,  32,  32,  32,   32,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,    64,   512,  32,  32,  32,   32,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,    64,   512,  32,  32,  32,   32,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>
+#endif
+    // clang-format on
+    >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple<
@@ -115,6 +163,42 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple<
 #endif
     // clang-format on
     >;
+// instances for double rate mfma on gfx950
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances_dr = std::tuple<
+// clang-format off
+        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                      Version|
+        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        // Latency friendly 
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    256, 32,  32,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    256, 32,  32,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    512, 32,  32,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    256, 32,  32,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    512, 32,  32,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
+        // Memory friendly
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    256, 32,  32,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    256, 32,  32,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    256, 32,  32,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    256, 32,  32,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,    256, 32,  32,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    256, 32,  32,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    256, 32,  32,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    128, 32,  32,  16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    256, 32,  32,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    512, 32,  32,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    256, 32,  32,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    512, 32,  32,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    256, 32,  32,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,    256, 32,  32,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    256, 32,  32,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    256, 32,  32,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    256, 32,  32,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>
+#endif
+    // clang-format on
+    >;
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
index d6c9809020..6cf0228c04 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
@@ -17,7 +17,13 @@ void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
 
-    if(ck::get_device_name() != "gfx950")
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances_dr<GemmDefault>{});
+    }
+    else
     {
         add_device_operation_instances(
             instances,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp
index fc6ad01742..65e49d5f88 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp
@@ -17,7 +17,13 @@ void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instances(
     add_device_operation_instances(
         instances, device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances<GemmKPadding>{});
 
-    if(ck::get_device_name() != "gfx950")
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances_dr<GemmKPadding>{});
+    }
+    else
     {
         add_device_operation_instances(
             instances,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
index f6a9c48555..13c4ff682f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
@@ -16,6 +16,13 @@ void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances<Intrawave, GemmDefault>{});
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances_dr<Intrawave,
+                                                                           GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
index f9c12e7cb2..49652b8680 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
@@ -16,6 +16,13 @@ void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instances
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances<Intrawave, GemmKPadding>{});
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances_dr<Intrawave,
+                                                                           GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
index 1d33c7fa57..120dfe0bee 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
@@ -16,6 +16,13 @@ void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances(
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances<Interwave, GemmDefault>{});
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances_dr<Interwave,
+                                                                           GemmDefault>{});
+    }
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
index 252aec5bc2..2dc5acaabb 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp
@@ -16,6 +16,13 @@ void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instances
     add_device_operation_instances(
         instances,
         device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances<Interwave, GemmKPadding>{});
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances_dr<Interwave,
+                                                                           GemmKPadding>{});
+    }
 }
 
 } // namespace instance
diff --git a/profiler/include/profiler/profile_gemm_ab_scale_impl.hpp b/profiler/include/profiler/profile_gemm_ab_scale_impl.hpp
index a84ad5269b..d68a1065ab 100644
--- a/profiler/include/profiler/profile_gemm_ab_scale_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_ab_scale_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -19,6 +19,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -74,6 +75,10 @@ bool profile_gemm_ab_scale_impl(int do_verification,
                                       ? ((K + ScaleBlockK - 1) / ScaleBlockK)
                                       : ((N + ScaleBlockN - 1) / ScaleBlockN);
 
+    ck::utils::validate_gemm_stride<ALayout>(M, K, StrideA, "StrideA");
+    ck::utils::validate_gemm_stride<BLayout>(K, N, StrideB, "StrideB");
+    ck::utils::validate_gemm_stride<ELayout>(M, N, StrideE, "StrideE"); // E is M x N
+
     Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<A1DataType> a1_m_k(f_host_tensor_descriptor((M + ScaleBlockM - 1) / ScaleBlockM,
                                                        (K + ScaleBlockK - 1) / ScaleBlockK,
diff --git a/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
index c0ffea8a32..405a2359c2 100644
--- a/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -15,6 +15,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -93,6 +94,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
             }
         };
 
+    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
+        M, N, K, StrideA, StrideB, StrideC);
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
 
diff --git a/profiler/include/profiler/profile_gemm_blockscale_wp_impl.hpp b/profiler/include/profiler/profile_gemm_blockscale_wp_impl.hpp
index 53073a6c75..32bdf05771 100644
--- a/profiler/include/profiler/profile_gemm_blockscale_wp_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_blockscale_wp_impl.hpp
@@ -20,6 +20,7 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/validation_common.hpp"
 
 namespace ck {
 namespace profiler {
@@ -104,6 +105,10 @@ bool profile_gemm_blockscale_weighpreshuffle_impl(int do_verification,
                                       ? ((K + ScaleBlockK - 1) / ScaleBlockK)
                                       : ((N + ScaleBlockN - 1) / ScaleBlockN);
 
+    ck::utils::validate_gemm_stride<ALayout>(M, K, StrideA, "StrideA");
+    ck::utils::validate_gemm_stride<BLayout>(K, N, StrideB, "StrideB");
+    ck::utils::validate_gemm_stride<ELayout>(M, N, StrideE, "StrideE"); // E is M x N
+
     Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<A1DataType> a1_m_k(f_host_tensor_descriptor((M + ScaleBlockM - 1) / ScaleBlockM,
                                                        (K + ScaleBlockK - 1) / ScaleBlockK,
diff --git a/profiler/include/profiler/profile_gemm_impl.hpp b/profiler/include/profiler/profile_gemm_impl.hpp
index d2a38b2a81..fdcb3ad128 100644
--- a/profiler/include/profiler/profile_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -24,6 +24,7 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/validation_common.hpp"
 
 namespace ck {
 namespace profiler {
@@ -64,6 +65,9 @@ int profile_gemm_impl(int do_verification,
             }
         };
 
+    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
+        M, N, K, StrideA, StrideB, StrideC);
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
diff --git a/profiler/include/profiler/profile_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_reduce_impl.hpp
index ff801e8afd..a74d2a01d9 100644
--- a/profiler/include/profiler/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_reduce_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -15,6 +15,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -88,6 +89,9 @@ bool profile_gemm_reduce_impl(int do_verification,
             }
         };
 
+    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
+        M, N, K, StrideA, StrideB, StrideC);
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
 
diff --git a/profiler/include/profiler/profile_gemm_splitk_impl.hpp b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
index 5d5ae1ad15..0640e95aba 100644
--- a/profiler/include/profiler/profile_gemm_splitk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -19,6 +19,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -62,6 +63,9 @@ bool profile_gemm_splitk_impl(int do_verification,
             }
         };
 
+    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
+        M, N, K, StrideA, StrideB, StrideC);
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
diff --git a/profiler/include/profiler/profile_gemm_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_streamk_impl.hpp
index 71b54c1f47..d24ee1c7ea 100644
--- a/profiler/include/profiler/profile_gemm_streamk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_streamk_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -19,6 +19,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -59,6 +60,9 @@ bool profile_gemm_streamk_impl(int do_verification,
             }
         };
 
+    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
+        M, N, K, StrideA, StrideB, StrideC);
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
diff --git a/profiler/include/profiler/profile_gemm_universal_impl.hpp b/profiler/include/profiler/profile_gemm_universal_impl.hpp
index ed62828158..feb75c9660 100644
--- a/profiler/include/profiler/profile_gemm_universal_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_impl.hpp
@@ -19,6 +19,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -63,6 +64,9 @@ bool profile_gemm_universal_impl(int do_verification,
             }
         };
 
+    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
+        M, N, K, StrideA, StrideB, StrideC);
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
diff --git a/profiler/include/profiler/profile_gemm_universal_preshuffle_impl.hpp b/profiler/include/profiler/profile_gemm_universal_preshuffle_impl.hpp
index e218143857..271bc6ef59 100644
--- a/profiler/include/profiler/profile_gemm_universal_preshuffle_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_preshuffle_impl.hpp
@@ -19,6 +19,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -91,6 +92,9 @@ bool profile_gemm_universal_preshuffle_impl(int do_verification,
             }
         };
 
+    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
+        M, N, K, StrideA, StrideB, StrideC);
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
diff --git a/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp
index d600de0978..a0ee6a6674 100644
--- a/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -19,6 +19,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 
 namespace ck {
@@ -64,6 +65,9 @@ bool profile_gemm_universal_reduce_impl(int do_verification,
             }
         };
 
+    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
+        M, N, K, StrideA, StrideB, StrideC);
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
diff --git a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
old mode 100755
new mode 100644
index 640b192baf..5c859b830d
--- a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -21,6 +21,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/validation_common.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
 
@@ -67,6 +68,9 @@ bool profile_gemm_universal_streamk_impl(int do_verification,
             }
         };
 
+    ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
+        M, N, K, StrideA, StrideB, StrideC);
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
diff --git a/profiler/src/profile_gemm_ab_scale.cpp b/profiler/src/profile_gemm_ab_scale.cpp
index 3956038a30..531872bbb9 100644
--- a/profiler/src/profile_gemm_ab_scale.cpp
+++ b/profiler/src/profile_gemm_ab_scale.cpp
@@ -173,6 +173,40 @@ int profile_gemm_ab_scale(int argc, char* argv[])
                        Col{},
                        Row{});
     }
+    else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN &&
+            scale_block_tile == ScaleBlockTile::Tile_1_128_128)
+    {
+        return profile(F8{},
+                       F32{},
+                       F8{},
+                       F32{},
+                       F8{},
+                       F32{},
+                       BF16{},
+                       ck::Number<1>{},
+                       ck::Number<128>{},
+                       ck::Number<128>{},
+                       Row{},
+                       Row{},
+                       Row{});
+    }
+    else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::KM_KN_MN &&
+            scale_block_tile == ScaleBlockTile::Tile_1_128_128)
+    {
+        return profile(F8{},
+                       F32{},
+                       F8{},
+                       F32{},
+                       F8{},
+                       F32{},
+                       BF16{},
+                       ck::Number<1>{},
+                       ck::Number<128>{},
+                       ck::Number<128>{},
+                       Col{},
+                       Row{},
+                       Row{});
+    }
     else
     {
         std::cout << "this data_type & layout is not implemented" << std::endl;
diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh
index 25a1590808..b93555901e 100755
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -1,44 +1,47 @@
 #!/bin/bash
-set -euo pipefail
-IFS=$'\n\t'
+# exit when a command exits with non-zero status; also when an unbound variable is referenced
+set -eu
+# pipefail is not supported by sh and dash: probe it in a subshell, then enable it in this shell
+if (set -o pipefail) 2>/dev/null; then set -o pipefail; fi
+# when treating a string as a sequence, split on newline and tab only, not on spaces
+IFS=$(printf '\n\t')
 
-rm -f CMakeCache.txt
-rm -f *.cmake
-rm -rf CMakeFiles
+# clean the build system files: delete all CMakeFiles dirs and CMakeCache.txt files below "."
+find . -name CMakeFiles     -type d -exec rm -rfv {} +
+find . -name CMakeCache.txt -type f -exec rm -rv  {} +
 
-MY_PROJECT_SOURCE=$1
+# optional first argument: path to the project source; ".." is used when it is omitted
+MY_PROJECT_SOURCE=".."
+if [ $# -ge 1 ]; then
+    MY_PROJECT_SOURCE="$1"
+    shift 1
+fi
 
+GPU_TARGETS="gfx908;gfx90a;gfx942"
 
-if [ $# -ge 2 ]; then
-    case "$2" in
-        gfx*) 
-            GPU_TARGETS=$2
-            shift 2
+if [ $# -ge 1 ]; then
+    case "$1" in 
+        gfx*)
+            GPU_TARGETS=$1
+            shift 1
             echo "GPU targets provided: $GPU_TARGETS"
-            REST_ARGS=$@
             ;;
         *)
-            echo "No GPU targets provided, using default targets: gfx908;gfx90a;gfx942"
-            GPU_TARGETS="gfx908;gfx90a;gfx942"
-            shift 1
-            REST_ARGS=$@
+            echo "No GPU targets provided, using default targets: $GPU_TARGETS"
             ;;
     esac
 else
-    echo "No GPU targets provided, using default targets: gfx908;gfx90a;gfx942"
-    GPU_TARGETS="gfx908;gfx90a;gfx942"
-    shift 1
-    REST_ARGS=$@
+    echo "No GPU targets provided, using default targets: $GPU_TARGETS"
 fi
 
 cmake                                                                                             \
 -D CMAKE_PREFIX_PATH=/opt/rocm/                                                                   \
 -D CMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++                                                  \
--D CMAKE_CXX_FLAGS="-std=c++20 -O3 -ftemplate-backtrace-limit=0  -fPIE  -Wno-gnu-line-marker"     \
+-D CMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0  -fPIE  -Wno-gnu-line-marker"                    \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
 -D GPU_TARGETS=$GPU_TARGETS                                                                       \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
-$REST_ARGS                                                                                        \
+"$@"                                                                                              \
 ${MY_PROJECT_SOURCE}
diff --git a/script/cmake-ck-release.sh b/script/cmake-ck-release.sh
deleted file mode 100755
index 5263de92c8..0000000000
--- a/script/cmake-ck-release.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-IFS=$'\n\t'
-
-rm -f CMakeCache.txt
-rm -f *.cmake
-rm -rf CMakeFiles
-
-MY_PROJECT_SOURCE=$1
-
-if [ $# -ge 2 ] && [[ "$2" =~ ^gfx ]]; then
-    GPU_TARGETS=$2
-    shift 2
-    echo "GPU targets provided: $GPU_TARGETS"
-    REST_ARGS=$@
-else
-    echo "No GPU targets provided, using default targets: gfx908;gfx90a;gfx942"
-    GPU_TARGETS="gfx908;gfx90a;gfx942"
-    shift 1
-    REST_ARGS=$@
-fi
-
-cmake                                                                                             \
--D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
--D CMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++                                                  \
--D CMAKE_CXX_FLAGS="-O3"                                                                          \
--D CMAKE_BUILD_TYPE=Release                                                                       \
--D BUILD_DEV=OFF                                                                                  \
--D GPU_TARGETS=$GPU_TARGETS                                                                       \
--D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
--D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
-$REST_ARGS                                                                                        \
-${MY_PROJECT_SOURCE}
-
diff --git a/script/dependency-parser/generate_list_of_files_not_referenced_in_tests.py b/script/dependency-parser/generate_list_of_files_not_referenced_in_tests.py
new file mode 100644
index 0000000000..7a15fee128
--- /dev/null
+++ b/script/dependency-parser/generate_list_of_files_not_referenced_in_tests.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+# This script generates a list of files that are not referenced from any test (list in JSON format)
+# The script only looks for unreferenced files in three directories: include, library and profiler
+# CK needs to be built with the ability to use the dependency parser and generate dependencies
+
+# Usage (run from script/dependency-parser): python3 generate_list_of_files_not_referenced_in_tests.py -f /path/to/enhanced_dependency_mapping/json/file
+
+import argparse
+import subprocess
+import json
+
+
+def parse_args():
+    """Parse the command line; ``-f`` (required) is the dependency mapping JSON path."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument(
+        "-f",
+        required=True,
+        help="Path to enhanced_dependency_mapping.json file generated by dependency parser",
+    )
+    return cli.parse_args()
+
+
+def main():
+    """Print a JSON list of include/library/profiler sources not referenced by any test.
+
+    Loads the "file_to_executables" mapping from the JSON file given with -f, lists all
+    *.cpp / *.hpp files below ../../include, ../../library and ../../profiler (relative
+    to the current working directory) and prints, grouped by top-level directory, the
+    ones that do not occur in any mapping key used by a "bin/test_" executable.
+    """
+    args = parse_args()
+    with open(args.f, "r") as file:
+        file_to_executables = json.load(file)["file_to_executables"]
+
+    # Run find directly instead of through a shell. The parentheses group both name
+    # tests: "-o" binds looser than the implicit AND, so without them "-type f" only
+    # applied to "*.cpp" and a directory named "*.hpp" would have been listed as well.
+    find_cmd = ["find", "../../include/", "../../library/", "../../profiler/"]
+    find_cmd += ["-type", "f", "(", "-iname", "*.cpp", "-o", "-iname", "*.hpp", ")"]
+    found_paths = subprocess.check_output(find_cmd).decode("utf-8").splitlines()
+    # Strip the leading "../../" so each path starts with its top-level directory.
+    all_files = [path[len("../../") :] for path in found_paths]
+
+    tracked_dirs = (
+        "composablekernel/include/",
+        "composablekernel/library/",
+        "composablekernel/profiler/",
+    )
+    # Mapping keys below a tracked directory that at least one test executable uses.
+    all_referenced_files = [
+        path
+        for path, exe_list in file_to_executables.items()
+        if any(prefix in path for prefix in tracked_dirs)
+        and any("bin/test_" in el for el in exe_list)
+    ]
+
+    not_referenced_files = {"include": [], "library": [], "profiler": []}
+    for f in all_files:
+        # Substring test: f starts at include/, library/ or profiler/; keys may be longer.
+        if not any(f in el for el in all_referenced_files):
+            not_referenced_files[f.partition("/")[0]].append(f)
+
+    print(json.dumps(not_referenced_files, indent="\t"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp b/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp
index 25b10e1dc4..dd90034064 100644
--- a/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp
+++ b/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp
@@ -66,5 +66,5 @@ float add_rmsnorm2d_rdquant_fwd_(const S& s, A a)
         std::cout << ", " << Kernel::GetName() << std::flush;
 
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 }
diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
index f654d1a917..f634e508e3 100644
--- a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
+++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
@@ -111,7 +111,6 @@ class TestCkTileBatchedGemm : public ::testing::Test
                                                  DsLayout,
                                                  CLayout,
                                                  ck_tile::element_wise::PassThrough,
-                                                 GemmPipelineProblem::kBlockSize,
                                                  TilePartitioner::MPerBlock,
                                                  TilePartitioner::NPerBlock,
                                                  M_Warp,
@@ -124,8 +123,8 @@ class TestCkTileBatchedGemm : public ::testing::Test
             using Kernel = ck_tile::BatchedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
             auto kargs   = Kernel::MakeKernelArgs(args);
 
-            const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
-            constexpr dim3 blocks = Kernel::BlockSize();
+            const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
+            const dim3 blocks = Kernel::BlockSize();
 
             if(!Kernel::IsSupportedArgument(kargs))
             {
@@ -144,7 +143,7 @@ class TestCkTileBatchedGemm : public ::testing::Test
             }
 
             ave_time = ck_tile::launch_kernel(
-                s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
             return ave_time;
         };
 
diff --git a/test/ck_tile/batched_transpose/test_batched_transpose.cpp b/test/ck_tile/batched_transpose/test_batched_transpose.cpp
index 77d5825eed..8812397946 100644
--- a/test/ck_tile/batched_transpose/test_batched_transpose.cpp
+++ b/test/ck_tile/batched_transpose/test_batched_transpose.cpp
@@ -137,11 +137,11 @@ class TestCkTileBatchedTranspose //              N    C    H    W    layout_in==
                                                                  Config::BlockTile::at(1)};
         auto kargs           = Kernel::MakeKargs(host_args);
 
-        auto sc                   = ck_tile::stream_config{};
-        const dim3 grid_size      = Kernel::GridSize(host_args);
-        constexpr dim3 block_size = Kernel::BlockSize();
-        ck_tile::launch_kernel(
-            sc, ck_tile::make_kernel<block_size.x, 1>(Kernel{}, grid_size, block_size, 0, kargs));
+        auto sc               = ck_tile::stream_config{};
+        const dim3 grid_size  = Kernel::GridSize(host_args);
+        const dim3 block_size = Kernel::BlockSize();
+        ck_tile::launch_kernel(sc,
+                               ck_tile::make_kernel<1>(Kernel{}, grid_size, block_size, 0, kargs));
 
         y_dev.FromDevice(y_host.data());
         ck_tile::reference_batched_transpose<DataType>(x_host, y_ref, layout_in, layout_out);
diff --git a/test/ck_tile/elementwise/test_elementwise_1d.cpp b/test/ck_tile/elementwise/test_elementwise_1d.cpp
index 7013792335..3ce6e78d1d 100644
--- a/test/ck_tile/elementwise/test_elementwise_1d.cpp
+++ b/test/ck_tile/elementwise/test_elementwise_1d.cpp
@@ -53,7 +53,7 @@ class TestCkTileElementwise : public ::testing::Test
     using BlockTile_        = std::tuple_element_t<5, Tuple>;
     using WarpTile_         = std::tuple_element_t<6, Tuple>;
     using TestElementWiseShape =
-        ck_tile::ElementWiseShape<BlockWarps_, BlockTile_, WarpTile_, ComputeDataType>;
+        ck_tile::ElementWiseShape<BlockWarps_, BlockTile_, WarpTile_, XDataType>;
     static constexpr int NumInputs = elementwise_op_traits<ElementwiseOpType>::num_inputs;
 
     void RunTest(ck_tile::index_t total_m_elements)
@@ -118,19 +118,17 @@ class TestCkTileElementwise : public ::testing::Test
                 "The kernel configuration is not supported for the given input size.");
         }
 
-        ck_tile::launch_kernel(
-            s,
-            ck_tile::make_kernel<TestElementWiseShape::kBlockSize, // MaxThreadPerBlock
-                                 kBlockPerCu>                      // MinBlockPerCu
-            (ew_kernel,
-             grid,
-             block,
-             0, // actual shared memory
-             lens,
-             strides, // input strides
-             strides, // output strides
-             d_x_ptrs_tuple,
-             p_y_device));
+        ck_tile::launch_kernel(s,
+                               ck_tile::make_kernel<kBlockPerCu> // MinBlockPerCu
+                               (ew_kernel,
+                                grid,
+                                block,
+                                0, // actual shared memory
+                                lens,
+                                strides, // input strides
+                                strides, // output strides
+                                d_x_ptrs_tuple,
+                                p_y_device));
 
         d_y_mem.FromDevice(h_y.data());
 
@@ -195,8 +193,7 @@ TYPED_TEST(TestCkTileElementwise, RunElementwise_1024) { this->RunTest(1024); }
 
 TYPED_TEST(TestCkTileElementwise, RunElementwise_513)
 {
-    EXPECT_THROW((this->RunTest(513)),
-                 std::runtime_error); // Test with an input size that's not a multiple of kVectorM
+    this->RunTest(513); // Test with an input size that's not a multiple of kVectorM
 }
 
 TYPED_TEST(TestCkTileElementwise, RunElementwise_516)
diff --git a/test/ck_tile/gemm/CMakeLists.txt b/test/ck_tile/gemm/CMakeLists.txt
index 6cbdc1a24e..a982e30a4c 100644
--- a/test/ck_tile/gemm/CMakeLists.txt
+++ b/test/ck_tile/gemm/CMakeLists.txt
@@ -30,6 +30,14 @@ if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
     target_compile_options(test_ck_tile_gemm_pipeline_basic_fp8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
     add_test_executable(test_ck_tile_gemm_pipeline_basic_bf8 test_gemm_pipeline_basic_bf8.cpp)
     target_compile_options(test_ck_tile_gemm_pipeline_basic_bf8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+elseif(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
+    # On Radeon devices, build the WMMA version instead
+    add_gtest_executable(test_ck_tile_gemm_pipeline_mem_wmma test_gemm_pipeline_mem_wmma.cpp)
+    add_gtest_executable(test_ck_tile_gemm_pipeline_compv3_wmma test_gemm_pipeline_compv3_wmma.cpp)
+    add_gtest_executable(test_ck_tile_gemm_pipeline_compv4_wmma test_gemm_pipeline_compv4_wmma.cpp)
+    target_compile_options(test_ck_tile_gemm_pipeline_mem_wmma PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+    target_compile_options(test_ck_tile_gemm_pipeline_compv3_wmma PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+    target_compile_options(test_ck_tile_gemm_pipeline_compv4_wmma PRIVATE ${EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS})
 else()
     message(DEBUG "Skipping ck_tile_gemm tests for current target")
 endif()
@@ -46,4 +54,7 @@ if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95" OR GPU_TARGETS MAT
     target_compile_options(test_ck_tile_gemm_pipeline_basic_fp16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
     add_test_executable(test_ck_tile_gemm_pipeline_basic_bf16 test_gemm_pipeline_basic_bf16.cpp)
     target_compile_options(test_ck_tile_gemm_pipeline_basic_bf16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+elseif(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
+    add_gtest_executable(test_ck_tile_gemm_pipeline_persistent_wmma test_gemm_pipeline_persistent_wmma.cpp)
+    target_compile_options(test_ck_tile_gemm_pipeline_persistent_wmma PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 endif()
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc b/test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc
index 4321709ea5..53eff9ecc4 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_basic_run_test.inc
@@ -77,7 +77,6 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
                                              ck_tile::tuple<>,
                                              CLayout,
                                              ck_tile::element_wise::PassThrough,
-                                             CodegenPipelineProblem::kBlockSize,
                                              TilePartitioner::MPerBlock,
                                              TilePartitioner::NPerBlock,
                                              M_Warp,
@@ -93,8 +92,8 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
         using Kernel = ck_tile::GemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
         auto kargs   = Kernel::MakeKernelArgs(args);
 
-        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
-        constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+        const dim3 blocks = Kernel::BlockSize();
 
         if(!Kernel::IsSupportedArgument(kargs))
         {
@@ -114,7 +113,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
         }
 
         float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
         return ave_time;
     };
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp
index 8944e6865d..370f4c16a8 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp
@@ -3,7 +3,8 @@
 #include "gtest/gtest.h"
 
 template <typename T>
-class TestCkTileGemmPipelineCompV3 : public TestCkTileGemmPipeline<T>
+class TestCkTileGemmPipelineCompV3
+    : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompV3<T>>
 {
 };
 
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compv3_wmma.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compv3_wmma.cpp
new file mode 100644
index 0000000000..6bd98d0bc7
--- /dev/null
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv3_wmma.cpp
@@ -0,0 +1,17 @@
+#include "test_gemm_pipeline_kernel_types.hpp"
+#include "test_gemm_pipeline_wmma_base.hpp"
+#include "gtest/gtest.h"
+
+template <typename T>
+class TestCkTileGemmPipelineCompV3Wmma
+    : public TestCkTileGemmPipelineWmmaBase<T, TestCkTileGemmPipelineCompV3Wmma<T>>
+{
+};
+
+#define TEST_SUITE_NAME TestCkTileGemmPipelineCompV3Wmma
+
+TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV3Wmma, KernelTypesCompV3Wmma);
+
+#include "test_gemm_pipeline_ut_cases.inc"
+
+#undef TEST_SUITE_NAME
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp
index 22e77fac41..6d5a5b93d6 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp
@@ -3,7 +3,8 @@
 #include "gtest/gtest.h"
 
 template <typename T>
-class TestCkTileGemmPipelineCompV4 : public TestCkTileGemmPipeline<T>
+class TestCkTileGemmPipelineCompV4
+    : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompV4<T>>
 {
 };
 
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compv4_wmma.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compv4_wmma.cpp
new file mode 100644
index 0000000000..f73901e761
--- /dev/null
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compv4_wmma.cpp
@@ -0,0 +1,17 @@
+#include "test_gemm_pipeline_kernel_types.hpp"
+#include "test_gemm_pipeline_wmma_base.hpp"
+#include "gtest/gtest.h"
+
+template <typename T>
+class TestCkTileGemmPipelineCompV4Wmma
+    : public TestCkTileGemmPipelineWmmaBase<T, TestCkTileGemmPipelineCompV4Wmma<T>>
+{
+};
+
+#define TEST_SUITE_NAME TestCkTileGemmPipelineCompV4Wmma
+
+TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV4Wmma, KernelTypesCompV4Wmma);
+
+#include "test_gemm_pipeline_ut_cases.inc"
+
+#undef TEST_SUITE_NAME
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp b/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp
index ae8899ba71..a55cd100c1 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp
@@ -9,13 +9,16 @@
 #include "ck_tile/host.hpp"
 #include "test_gemm_pipeline_util.hpp"
 
-using I8  = ck_tile::int8_t;
-using I32 = ck_tile::int32_t;
+using INT8  = ck_tile::int8_t;
+using INT32 = ck_tile::int32_t;
 
 using F16 = ck_tile::half_t;
 using F32 = float;
 using F8  = ck_tile::fp8_t;
 
+using BF16 = ck_tile::bf16_t;
+using BF8  = ck_tile::bf8_t;
+
 using Row       = ck_tile::tensor_layout::gemm::RowMajor;
 using Col       = ck_tile::tensor_layout::gemm::ColumnMajor;
 using Intrawave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
@@ -30,52 +33,119 @@ using CompV4 = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::Co
 using Persistent    = std::true_type;
 using NonPersistent = std::false_type;
 
+using I16  = ck_tile::number<16>;
+using I32  = ck_tile::number<32>;
+using I64  = ck_tile::number<64>;
+using I256 = ck_tile::number<256>;
+
 // clang-format off
 using KernelTypesMem = ::testing::Types<
-    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
-    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
-    std::tuple<    Row,     Row,     Row,       F8,       F8,         F32,       F16,             Interwave,         Mem>,
-    std::tuple<    Row,     Row,     Row,       F8,       F8,         F32,       F16,             Intrawave,         Mem>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
-    std::tuple<    Row,     Col,     Row,       F8,       F8,         F32,       F16,             Interwave,         Mem>,
-    std::tuple<    Row,     Col,     Row,       F8,       F8,         F32,       F16,             Intrawave,         Mem>,
-    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
-    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
-    std::tuple<    Col,     Row,     Row,       F8,       F8,         F32,       F16,             Intrawave,         Mem>,
-    std::tuple<    Col,     Row,     Row,       F8,       F8,         F32,       F16,             Interwave,         Mem>,
-    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
-    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
-    std::tuple<    Col,     Col,     Row,       F8,       F8,         F32,       F16,             Intrawave,         Mem>,
-    std::tuple<    Col,     Col,     Row,       F8,       F8,         F32,       F16,             Interwave,         Mem>
+    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, M_BlockSize, N_BlockSize, K_BlockSize, M_TileSize, N_TileSize, K_TileSize, Scheduler, PipelineType
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Interwave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Interwave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Interwave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Interwave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Interwave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Interwave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Interwave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Interwave,         Mem>
+>;
+
+using KernelTypesMemWmma = ::testing::Types< // type list for TestCkTileGemmPipelineMemWmma; each datatype/layout group pairs Intrawave with Interwave
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Interwave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       BF16,      BF16,        F32,       BF16,       I64,         I64,          I32,        I16,        I16,        I16, Intrawave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       BF16,      BF16,        F32,       BF16,       I64,         I64,          I32,        I16,        I16,        I16, Interwave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Interwave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Intrawave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Interwave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Interwave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Interwave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Interwave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Interwave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Interwave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Interwave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Interwave,         Mem>
 >;
 
 using KernelTypesCompV3 = ::testing::Types<
-     std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV3>,
-     std::tuple<    Row,     Row,     Row,       F8,       F8,         F32,       F16,             Intrawave,         CompV3>,
-     std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV3>,
-     std::tuple<    Row,     Col,     Row,       F8,       F8,         F32,       F16,             Intrawave,         CompV3>,
-     std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV3>,
-     std::tuple<    Col,     Row,     Row,       F8,       F8,         F32,       F16,             Intrawave,         CompV3>,
-     std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV3>,
-     std::tuple<    Col,     Col,     Row,       F8,       F8,         F32,       F16,             Intrawave,        CompV3>,
-     std::tuple<    Row,     Row,     Row,       I8,       I8,         I32,       I32,             Intrawave,        CompV3>, 
-     std::tuple<    Row,     Col,     Row,       I8,       I8,         I32,       I32,             Intrawave,        CompV3>,
-     std::tuple<    Col,     Row,     Row,       I8,       I8,         I32,       I32,             Intrawave,        CompV3>, 
-     std::tuple<    Col,     Col,     Row,       I8,       I8,         I32,       I32,             Intrawave,        CompV3>
-    
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3>
+>;
+
+using KernelTypesCompV3Wmma = ::testing::Types<
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF16,      BF16,        F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Row,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF16,      BF16,        F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Row,     Col,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF16,      BF16,        F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Row,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF16,      BF16,        F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       INT8,      INT8,        INT32,     INT32,      I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       F8,        F8,          F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>,
+    std::tuple<    Col,     Col,     Row,       BF8,       BF8,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3>
 >;
 
 using KernelTypesCompV4 = ::testing::Types<
-    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV4>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV4>,
-    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV4>,
-    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV4>
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I32,        I32,        I32,        I16, Intrawave,        CompV4>
 >;
 
+using KernelTypesCompV4Wmma = ::testing::Types<
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV4>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV4>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV4>
+>;
+
+
 using KernelTypesPersistent = ::testing::Types<
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV3,    Persistent>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        CompV3, NonPersistent>
+    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, M_BlockSize, N_BlockSize, K_BlockSize, M_TileSize, N_TileSize, K_TileSize, Scheduler,  PipelineType,    Persistent
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3,    Persistent>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I256,        I256,         I64,        I32,        I32,        I16, Intrawave,        CompV3, NonPersistent>
+>;
+
+using KernelTypesPersistentWmma = ::testing::Types<
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3,    Persistent>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,        I64,         I64,          I32,        I16,        I16,        I16, Intrawave,        CompV3, NonPersistent>
 >;
 
 // clang-format on
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_mem.cpp b/test/ck_tile/gemm/test_gemm_pipeline_mem.cpp
index a7f4e68386..51fbebc915 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_mem.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_mem.cpp
@@ -3,7 +3,7 @@
 #include "gtest/gtest.h"
 
 template <typename T>
-class TestCkTileGemmPipelineMem : public TestCkTileGemmPipeline<T>
+class TestCkTileGemmPipelineMem : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineMem<T>>
 {
 };
 
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_mem_wmma.cpp b/test/ck_tile/gemm/test_gemm_pipeline_mem_wmma.cpp
new file mode 100644
index 0000000000..5af5e09b28
--- /dev/null
+++ b/test/ck_tile/gemm/test_gemm_pipeline_mem_wmma.cpp
@@ -0,0 +1,17 @@
+#include "test_gemm_pipeline_kernel_types.hpp"
+#include "test_gemm_pipeline_wmma_base.hpp"
+#include "gtest/gtest.h"
+
+template <typename T>
+class TestCkTileGemmPipelineMemWmma
+    : public TestCkTileGemmPipelineWmmaBase<T, TestCkTileGemmPipelineMemWmma<T>>
+{
+};
+
+#define TEST_SUITE_NAME TestCkTileGemmPipelineMemWmma
+
+TYPED_TEST_SUITE(TestCkTileGemmPipelineMemWmma, KernelTypesMemWmma);
+
+#include "test_gemm_pipeline_ut_cases.inc"
+
+#undef TEST_SUITE_NAME
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp b/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
index 1dea1ab48c..54410acf70 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
@@ -3,7 +3,8 @@
 #include "gtest/gtest.h"
 
 template <typename T>
-class TestCkTileGemmPipelinePersistent : public TestCkTileGemmPipeline<T>
+class TestCkTileGemmPipelinePersistent
+    : public TestCkTileGemmPipeline<T, TestCkTileGemmPipelinePersistent<T>>
 {
 };
 
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_persistent_wmma.cpp b/test/ck_tile/gemm/test_gemm_pipeline_persistent_wmma.cpp
new file mode 100644
index 0000000000..45ab586aa9
--- /dev/null
+++ b/test/ck_tile/gemm/test_gemm_pipeline_persistent_wmma.cpp
@@ -0,0 +1,17 @@
+#include "test_gemm_pipeline_kernel_types.hpp"
+#include "test_gemm_pipeline_wmma_base.hpp"
+#include "gtest/gtest.h"
+
+template <typename T>
+class TestCkTileGemmPipelinePersistentWmma
+    : public TestCkTileGemmPipelineWmmaBase<T, TestCkTileGemmPipelinePersistentWmma<T>>
+{
+};
+
+#define TEST_SUITE_NAME TestCkTileGemmPipelinePersistentWmma
+
+TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesPersistentWmma);
+
+#include "test_gemm_pipeline_ut_cases.inc"
+
+#undef TEST_SUITE_NAME
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc b/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
index a22ecf2486..adae8dcf92 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
+++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
@@ -91,7 +91,6 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
                                              DsLayout,
                                              ELayout,
                                              CDEElementWise,
-                                             GemmPipelineProblem::kBlockSize,
                                              TilePartitioner::MPerBlock,
                                              TilePartitioner::NPerBlock,
                                              GemmConfig::M_Warp,
@@ -114,7 +113,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
         {
             grids = Kernel::GridSize(args.M, args.N, args.k_batch);
         }
-        constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 blocks = Kernel::BlockSize();
 
         if(!Kernel::IsSupportedArgument(kargs))
         {
@@ -165,15 +164,13 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
             ave_time = ck_tile::launch_kernel_time_mask(
                 s,
                 run_flush_cache,
-                ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
-                    Kernel{}, grids, blocks, 0, kargs));
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
         }
         else
         {
-            ave_time =
-                ck_tile::launch_kernel(s,
-                                       ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
-                                           Kernel{}, grids, blocks, 0, kargs));
+            ave_time = ck_tile::launch_kernel(
+                s,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
         }
         return ave_time;
     };
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
index 70aa161881..af4f8d3d38 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
@@ -10,6 +10,7 @@
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/core/numeric/math.hpp"
 
 template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
 auto calculate_rtol_atol(const ck_tile::index_t K,
@@ -69,7 +70,7 @@ struct GemmPipelineTypeSelector<GemmPipelineType::CompV4, Problem>
     static constexpr auto GetName() { return "GemmPipelineAgBgCrCompV4"; }
 };
 
-template <typename Tuple>
+template <typename Tuple, typename Derived>
 class TestCkTileGemmPipeline : public ::testing::Test
 {
     protected:
@@ -80,32 +81,30 @@ class TestCkTileGemmPipeline : public ::testing::Test
     using BDataType                    = std::tuple_element_t<4, Tuple>;
     using AccDataType                  = std::tuple_element_t<5, Tuple>;
     using CDataType                    = std::tuple_element_t<6, Tuple>;
-    static constexpr auto Scheduler    = std::tuple_element_t<7, Tuple>::value;
-    static constexpr auto PipelineType = std::tuple_element_t<8, Tuple>::value;
+    static constexpr auto Scheduler    = std::tuple_element_t<13, Tuple>::value;
+    static constexpr auto PipelineType = std::tuple_element_t<14, Tuple>::value;
+
+    static constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, Tuple>{};
+    static constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, Tuple>{};
+    static constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, Tuple>{};
+
+    static constexpr ck_tile::index_t M_Warp_Tile = std::tuple_element_t<10, Tuple>{};
+    static constexpr ck_tile::index_t N_Warp_Tile = std::tuple_element_t<11, Tuple>{};
+    static constexpr ck_tile::index_t K_Warp_Tile = std::tuple_element_t<12, Tuple>{};
 
     using DsLayout   = ck_tile::tuple<>;
     using DsDataType = ck_tile::tuple<>;
 
     static constexpr bool Persistent =
-        ck_tile::tuple_element_or_default_t<Tuple, 9, std::false_type>::value;
-    // TODO: expose tile size through test t-param ?
+        ck_tile::tuple_element_or_default_t<Tuple, 15, std::false_type>::value;
 
     template <bool PadM, bool PadN, bool PadK, bool Preshuffle>
     void invoke_gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
     {
-        // TODO: This should be parameterized in tests
-        constexpr ck_tile::index_t M_Tile = 256;
-        constexpr ck_tile::index_t N_Tile = 256;
-        constexpr ck_tile::index_t K_Tile = (PipelineType == GemmPipelineType::CompV4) ? 32 : 64;
-
         constexpr ck_tile::index_t M_Warp = 2;
         constexpr ck_tile::index_t N_Warp = 2;
         constexpr ck_tile::index_t K_Warp = 1;
 
-        constexpr ck_tile::index_t M_Warp_Tile = 32;
-        constexpr ck_tile::index_t N_Warp_Tile = 32;
-        constexpr ck_tile::index_t K_Warp_Tile = 16;
-
         constexpr bool kPadM      = PadM;
         constexpr bool kPadN      = PadN;
         constexpr bool kPadK      = PadK;
@@ -186,7 +185,6 @@ class TestCkTileGemmPipeline : public ::testing::Test
                                                  DsLayout,
                                                  CLayout,
                                                  ck_tile::element_wise::PassThrough,
-                                                 GemmPipeline::BlockSize,
                                                  TilePartitioner::MPerBlock,
                                                  TilePartitioner::NPerBlock,
                                                  M_Warp,
@@ -209,7 +207,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
             {
                 grids = Kernel::GridSize(args.M, args.N, args.k_batch);
             }
-            constexpr dim3 blocks = Kernel::BlockSize();
+            dim3 blocks = Kernel::BlockSize();
 
             if(!Kernel::IsSupportedArgument(kargs))
             {
@@ -224,7 +222,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
             }
 
             ck_tile::launch_kernel(
-                s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
         };
 
         const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
@@ -247,11 +245,48 @@ class TestCkTileGemmPipeline : public ::testing::Test
         BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
     }
 
+    template <typename ADataType,
+              typename BDataType,
+              typename AccDataType,
+              ck_tile::index_t M_Warp_Tile,
+              ck_tile::index_t N_Warp_Tile,
+              ck_tile::index_t K_Warp_Tile>
+    bool check_data_type()
+    {
+        return static_cast<Derived*>(this)
+            ->template check_data_type_impl<ADataType,
+                                            BDataType,
+                                            AccDataType,
+                                            M_Warp_Tile,
+                                            N_Warp_Tile,
+                                            K_Warp_Tile>();
+    }
+
+    template <typename ADataType,
+              typename BDataType,
+              typename AccDataType,
+              ck_tile::index_t M_Warp_Tile,
+              ck_tile::index_t N_Warp_Tile,
+              ck_tile::index_t K_Warp_Tile>
+    bool check_data_type_impl()
+    {
+        return true;
+    }
+
     public:
     std::vector<int> k_batches_;
 
     void SetUp() override
     {
+        if(!check_data_type<ADataType,
+                            BDataType,
+                            AccDataType,
+                            M_Warp_Tile,
+                            N_Warp_Tile,
+                            K_Warp_Tile>())
+        {
+            GTEST_SKIP() << "Unsupported data type combination for gemm pipeline test.";
+        }
         if constexpr(PipelineType == GemmPipelineType::CompV4)
         {
             // Only do k_batch = 1 when pipeline is CompV4
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_wmma_base.hpp b/test/ck_tile/gemm/test_gemm_pipeline_wmma_base.hpp
new file mode 100644
index 0000000000..8d8d245b6a
--- /dev/null
+++ b/test/ck_tile/gemm/test_gemm_pipeline_wmma_base.hpp
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "test_gemm_pipeline_util.hpp"
+
+template <typename Tuple, typename Derived>
+class TestCkTileGemmPipelineWmmaBase : public TestCkTileGemmPipeline<Tuple, Derived>
+{
+    public:
+    template <typename ADataType,
+              typename BDataType,
+              typename AccDataType,
+              ck_tile::index_t M_Warp_Tile,
+              ck_tile::index_t N_Warp_Tile,
+              ck_tile::index_t K_Warp_Tile>
+    bool check_data_type_impl()
+    {
+        return ck_tile::check_wmma_supported<ADataType,
+                                             BDataType,
+                                             AccDataType,
+                                             M_Warp_Tile,
+                                             N_Warp_Tile,
+                                             K_Warp_Tile>();
+    }
+};
diff --git a/test/ck_tile/gemm_block_scale/test_run_gemm_aquant_example.inc b/test/ck_tile/gemm_block_scale/test_run_gemm_aquant_example.inc
index a63a58b473..e8ff45fc5e 100644
--- a/test/ck_tile/gemm_block_scale/test_run_gemm_aquant_example.inc
+++ b/test/ck_tile/gemm_block_scale/test_run_gemm_aquant_example.inc
@@ -24,7 +24,8 @@ template <typename ADataType,
           typename ALayout,
           typename BLayout,
           typename CLayout,
-          uint32_t QuantGroupSize>
+          uint32_t QuantGroupSize,
+          bool Preshuffle = false>
 float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::stream_config& s)
 {
     constexpr bool kPadM = false;
@@ -55,7 +56,7 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s
     using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
 
     using CodegenGemmTraits =
-        ck_tile::TileGemmAQuantTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+        ck_tile::TileGemmAQuantTraits<kPadM, kPadN, kPadK, Preshuffle, ALayout, BLayout, CLayout>;
 
     using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
                                                                  BDataType,
@@ -98,7 +99,6 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s
                                                     ck_tile::tuple<>,
                                                     CLayout,
                                                     ck_tile::element_wise::PassThrough,
-                                                    CodegenPipelineProblem::kBlockSize,
                                                     TilePartitioner::MPerBlock,
                                                     TilePartitioner::NPerBlock,
                                                     M_Warp,
@@ -113,8 +113,8 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s
 
         auto kargs = Kernel::MakeKernelArgs(args);
 
-        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
-        constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+        const dim3 blocks = Kernel::BlockSize();
 
         if(args.k_batch != 1)
         {
@@ -138,7 +138,7 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s
         }
 
         float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+            s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
         return ave_time;
     };
@@ -161,7 +161,8 @@ template <typename ADataType,
           typename AQLayout,
           typename BLayout,
           typename CLayout,
-          uint32_t QuantGroupSize>
+          uint32_t QuantGroupSize,
+          bool Preshuffle = false>
 float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                   ck_tile::DeviceMem& aq_m_aqk_dev_buf,
                   ck_tile::DeviceMem& b_k_n_dev_buf,
@@ -202,7 +203,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                                       ALayout,
                                       BLayout,
                                       CLayout,
-                                      QuantGroupSize>(
+                                      QuantGroupSize,
+                                      Preshuffle>(
         args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
 
     std::size_t flop     = std::size_t(2) * M * N * K;
diff --git a/test/ck_tile/gemm_multi_d/test_gemm_multi_d_util.hpp b/test/ck_tile/gemm_multi_d/test_gemm_multi_d_util.hpp
index c08951435e..d21777c92b 100644
--- a/test/ck_tile/gemm_multi_d/test_gemm_multi_d_util.hpp
+++ b/test/ck_tile/gemm_multi_d/test_gemm_multi_d_util.hpp
@@ -178,7 +178,6 @@ class TestCkTileGemmMultiD : public ::testing::Test
                                                  DsLayout,
                                                  ELayout,
                                                  CDEElementWise,
-                                                 GemmPipelineProblem::kBlockSize,
                                                  TilePartitioner::MPerBlock,
                                                  TilePartitioner::NPerBlock,
                                                  M_Warp,
@@ -192,8 +191,8 @@ class TestCkTileGemmMultiD : public ::testing::Test
             using Kernel = ck_tile::GemmKernelMultiD<TilePartitioner, GemmPipeline, GemmEpilogue>;
             auto kargs   = Kernel::MakeKernelArgs(args);
 
-            const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
-            constexpr dim3 blocks = Kernel::BlockSize();
+            const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+            const dim3 blocks = Kernel::BlockSize();
 
             if(!Kernel::IsSupportedArgument(kargs))
             {
@@ -212,7 +211,7 @@ class TestCkTileGemmMultiD : public ::testing::Test
             }
 
             ave_time = ck_tile::launch_kernel(
-                s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
             return ave_time;
         };
 
diff --git a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_ut_cases.inc b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_ut_cases.inc
old mode 100755
new mode 100644
diff --git a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_util.hpp
index af229aad29..5d52f15696 100644
--- a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_util.hpp
+++ b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_util.hpp
@@ -183,7 +183,6 @@ class TestCkTileGemmPipeline : public ::testing::Test
                                                  DsLayout,
                                                  CLayout,
                                                  ck_tile::element_wise::PassThrough,
-                                                 GemmPipeline::BlockSize,
                                                  TilePartitioner::MPerBlock,
                                                  TilePartitioner::NPerBlock,
                                                  GemmConfig::M_Warp,
@@ -206,7 +205,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
             {
                 grids = Kernel::GridSize(args.M, args.N, args.k_batch);
             }
-            constexpr dim3 blocks = Kernel::BlockSize();
+            const dim3 blocks = Kernel::BlockSize();
 
             if(!Kernel::IsSupportedArgument(kargs))
             {
@@ -221,7 +220,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
             }
 
             ck_tile::launch_kernel(
-                s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
         };
 
         const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
index cededd38f9..5aca02a433 100644
--- a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
+++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
@@ -136,7 +136,6 @@ class TestCkTileGroupedGemm : public ::testing::Test
                                                  DsLayout,
                                                  CLayout,
                                                  ck_tile::element_wise::PassThrough,
-                                                 GemmPipelineProblem::kBlockSize,
                                                  TilePartitioner::MPerBlock,
                                                  TilePartitioner::NPerBlock,
                                                  GroupedGemKernelParam::M_Warp,
@@ -150,8 +149,8 @@ class TestCkTileGroupedGemm : public ::testing::Test
             auto kargs   = Kernel::MakeKargs(gemm_descs);
             EXPECT_TRUE(Kernel::IsSupportedArgument(kargs));
 
-            const dim3 grids      = Kernel::GridSize(gemm_descs);
-            constexpr dim3 blocks = Kernel::BlockSize();
+            const dim3 grids  = Kernel::GridSize(gemm_descs);
+            const dim3 blocks = Kernel::BlockSize();
 
             ck_tile::hip_check_error(hipMemcpyWithStream(kargs_ptr,
                                                          kargs.data(),
@@ -169,7 +168,7 @@ class TestCkTileGroupedGemm : public ::testing::Test
 
             ave_time = ck_tile::launch_kernel(
                 s,
-                ck_tile::make_kernel<blocks.x, GroupedGemKernelParam::kBlockPerCu>(
+                ck_tile::make_kernel<GroupedGemKernelParam::kBlockPerCu>(
                     Kernel{},
                     grids,
                     blocks,
@@ -227,12 +226,6 @@ class TestCkTileGroupedGemm : public ::testing::Test
         using TilePartitioner = ck_tile::
             GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
 
-        using Traits = ck_tile::TileGemmTraits<GroupedGemKernelParam::kPadM,
-                                               GroupedGemKernelParam::kPadN,
-                                               GroupedGemKernelParam::kPadK,
-                                               ALayout,
-                                               BLayout,
-                                               CLayout>;
         using GemmUniversalTraits =
             ck_tile::PersistentTileGemmUniversalTraits<GroupedGemKernelParam::kPadM,
                                                        GroupedGemKernelParam::kPadN,
@@ -242,8 +235,6 @@ class TestCkTileGroupedGemm : public ::testing::Test
                                                        BLayout,
                                                        CLayout,
                                                        TransposeC>;
-        using GemmPipelineProblem =
-            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
 
         const auto Run = [&](const auto memory_operation_) {
             constexpr auto scheduler        = ck_tile::GemmPipelineScheduler::Intrawave;
@@ -268,7 +259,6 @@ class TestCkTileGroupedGemm : public ::testing::Test
                                                  DsLayout,
                                                  CLayout,
                                                  ck_tile::element_wise::PassThrough,
-                                                 GemmPipelineProblem::kBlockSize,
                                                  TilePartitioner::MPerBlock,
                                                  TilePartitioner::NPerBlock,
                                                  GroupedGemKernelParam::M_Warp,
@@ -279,8 +269,8 @@ class TestCkTileGroupedGemm : public ::testing::Test
                                                  UniversalGemmProblem::TransposeC,
                                                  memory_operation>>;
             using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-            constexpr dim3 blocks = Kernel::BlockSize();
-            const dim3 grids      = Kernel::MaxOccupancyGridSize(s);
+            const dim3 blocks = Kernel::BlockSize();
+            const dim3 grids  = Kernel::MaxOccupancyGridSize(s);
 
             if(s.log_level_ > 0)
             {
@@ -291,7 +281,7 @@ class TestCkTileGroupedGemm : public ::testing::Test
             }
 
             ck_tile::launch_kernel(s,
-                                   ck_tile::make_kernel<blocks.x, kBlockPerCu>(
+                                   ck_tile::make_kernel<kBlockPerCu>(
                                        Kernel{},
                                        grids,
                                        blocks,
diff --git a/test/ck_tile/image_to_column/test_tile_image_to_column.cpp b/test/ck_tile/image_to_column/test_tile_image_to_column.cpp
index 9c0746e972..c721f1073f 100644
--- a/test/ck_tile/image_to_column/test_tile_image_to_column.cpp
+++ b/test/ck_tile/image_to_column/test_tile_image_to_column.cpp
@@ -97,13 +97,13 @@ class TestCkTileImageToColumn : public ::testing::Test
             kargs.N * kargs.output_spatial_lengths[0] * kargs.output_spatial_lengths[1],
             kargs.filter_spatial_lengths[0] * kargs.filter_spatial_lengths[1] * kargs.C,
             kargs.G);
-        constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 blocks = Kernel::BlockSize();
 
         constexpr ck_tile::index_t kBlockPerCu = 2;
 
         ck_tile::launch_kernel(
             ck_tile::stream_config{},
-            ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+            ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
         // reference
         ck_tile::reference_im2col<DataType, DataType, NDimSpatial>(in, out_host, conv_params);
diff --git a/test/ck_tile/layernorm2d/generate.py b/test/ck_tile/layernorm2d/generate.py
index d77582630a..c4366f6662 100644
--- a/test/ck_tile/layernorm2d/generate.py
+++ b/test/ck_tile/layernorm2d/generate.py
@@ -235,7 +235,7 @@ float layernorm2d_fwd_(const S& s, A a)
     using Kernel = ck_tile::Layernorm2dFwd<Pipeline, Epilogue>;
 
     const dim3 grids                       = Kernel::GridSize(a);
-    constexpr dim3 blocks                  = Kernel::BlockSize();
+    const dim3 blocks                      = Kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
     auto kargs = Kernel::MakeKargs(a);
@@ -243,7 +243,7 @@ float layernorm2d_fwd_(const S& s, A a)
         std::cout << ", " << Kernel::GetName() << std::flush;
 
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{{}}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(Kernel{{}}, grids, blocks, 0, kargs));
 }}
 
 """
diff --git a/test/ck_tile/memory_copy/README.md b/test/ck_tile/memory_copy/README.md
index 7856f0b4bd..9c56052b64 100644
--- a/test/ck_tile/memory_copy/README.md
+++ b/test/ck_tile/memory_copy/README.md
@@ -12,7 +12,7 @@ is moved to output DRAM window for a simple copy operation.
 mkdir build && cd build
 # you can replace <arch> with the appropriate architecture 
 # (for example gfx90a or gfx942) or leave it blank
-sh ../script/cmake-ck-dev.sh  ../ <arch>
+../script/cmake-ck-dev.sh  ../ <arch>
 # Make the copy kernel executable
 make test_copy -j
 ```
diff --git a/test/ck_tile/memory_copy/test_copy.cpp b/test/ck_tile/memory_copy/test_copy.cpp
index e8962dce29..30a2e60ea9 100644
--- a/test/ck_tile/memory_copy/test_copy.cpp
+++ b/test/ck_tile/memory_copy/test_copy.cpp
@@ -76,17 +76,17 @@ class TestCkTileMemoryCopy : public ::testing::TestWithParam<std::tuple<int, int
         constexpr ck_tile::index_t kBlockSize  = 128;
         constexpr ck_tile::index_t kBlockPerCu = 1;
 
-        auto ms = launch_kernel(ck_tile::stream_config{nullptr, true},
-                                ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
-                                    Kernel{},
-                                    kGridSize,
-                                    kBlockSize,
-                                    0,
-                                    static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
-                                    static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
-                                    m,
-                                    n,
-                                    warp_id));
+        auto ms = launch_kernel(
+            ck_tile::stream_config{nullptr, true},
+            ck_tile::make_kernel<kBlockPerCu>(Kernel{},
+                                              kGridSize,
+                                              kBlockSize,
+                                              0,
+                                              static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
+                                              static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
+                                              m,
+                                              n,
+                                              warp_id));
 
         auto bytes = 2 * m * n * sizeof(DataType);
         std::cout << "elapsed: " << ms << " (ms)" << std::endl;
diff --git a/test/ck_tile/memory_copy/test_copy.hpp b/test/ck_tile/memory_copy/test_copy.hpp
index a9840ba2c6..4833b29560 100644
--- a/test/ck_tile/memory_copy/test_copy.hpp
+++ b/test/ck_tile/memory_copy/test_copy.hpp
@@ -64,7 +64,8 @@ struct TileCopy
     using Problem   = ck_tile::remove_cvref_t<Problem_>;
     using XDataType = typename Problem::XDataType;
 
-    static constexpr bool AsyncCopy = Problem::AsyncCopy;
+    static constexpr index_t kBlockSize = Problem::BlockShape::BlockSize;
+    static constexpr bool AsyncCopy     = Problem::AsyncCopy;
 
     template <typename Problem>
     CK_TILE_DEVICE static constexpr auto MakeDRAMDistribution()
diff --git a/test/ck_tile/moe_smoothquant/CMakeLists.txt b/test/ck_tile/moe_smoothquant/CMakeLists.txt
index 70999fa06b..b6c8a395b6 100644
--- a/test/ck_tile/moe_smoothquant/CMakeLists.txt
+++ b/test/ck_tile/moe_smoothquant/CMakeLists.txt
@@ -2,7 +2,7 @@
 if(GPU_TARGETS MATCHES "gfx9")
     function (add_moe_smoothquant_test TARGET_NAME MAIN_SRC)
         message(DEBUG "adding ${TARGET_NAME}")
-        add_test_executable(${TARGET_NAME} ${MAIN_SRC})
+        add_gtest_executable(${TARGET_NAME} ${MAIN_SRC})
         target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 
         foreach(source IN LISTS ARGN)
@@ -21,11 +21,7 @@ if(GPU_TARGETS MATCHES "gfx9")
 
     file(GLOB INSTANCE_SRCS instances/*.cpp)
 
-    add_moe_smoothquant_test(test_ck_tile_moe_smoothquant_fp16_fp8 moe_smoothquant_fp16_fp8.cpp ${INSTANCE_SRCS})
-    add_moe_smoothquant_test(test_ck_tile_moe_smoothquant_fp16_int8 moe_smoothquant_fp16_int8.cpp ${INSTANCE_SRCS})
-
-    add_moe_smoothquant_test(test_ck_tile_moe_smoothquant_bf16_fp8 moe_smoothquant_bf16_fp8.cpp ${INSTANCE_SRCS})
-    add_moe_smoothquant_test(test_ck_tile_moe_smoothquant_bf16_int8 moe_smoothquant_bf16_int8.cpp ${INSTANCE_SRCS})
+    add_moe_smoothquant_test(test_ck_tile_moe_smoothquant test_moe_smoothquant.cpp ${INSTANCE_SRCS})
 
 else()
     message(DEBUG "Skipping ck_tile MOE smoothquant tests for current target")
diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp
index 0b890ab3ac..60c640d930 100644
--- a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp
+++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp
@@ -24,9 +24,7 @@ using trait_ = moe_smoothquant_traits_<InType,
                                        kTwoPass_>;
 
 template <typename in_type, typename out_type>
-float moe_smoothquant_dispatch(moe_smoothquant_traits /*t*/,
-                               moe_smoothquant_args a,
-                               const ck_tile::stream_config& s)
+float moe_smoothquant_dispatch(moe_smoothquant_args a, const ck_tile::stream_config& s)
 {
     float r = -1;
     // clang-format off
@@ -130,26 +128,30 @@ float moe_smoothquant_dispatch(moe_smoothquant_traits /*t*/,
     // clang-format on
 }
 
-float moe_smoothquant(moe_smoothquant_traits t,
-                      moe_smoothquant_args a,
-                      const ck_tile::stream_config& s)
+template <>
+float moe_smoothquant<ck_tile::fp16_t, ck_tile::int8_t>(moe_smoothquant_args a,
+                                                        const ck_tile::stream_config& s)
 {
-    if(t.in_type.compare("fp16") == 0 && t.out_type == "int8")
-    {
-        return moe_smoothquant_dispatch<ck_tile::fp16_t, ck_tile::int8_t>(t, a, s);
-    }
-    else if(t.in_type.compare("fp16") == 0 && t.out_type == "fp8")
-    {
-        return moe_smoothquant_dispatch<ck_tile::fp16_t, ck_tile::fp8_t>(t, a, s);
-    }
-    else if(t.in_type.compare("bf16") == 0 && t.out_type == "int8")
-    {
-        return moe_smoothquant_dispatch<ck_tile::bf16_t, ck_tile::int8_t>(t, a, s);
-    }
-    else if(t.in_type.compare("bf16") == 0 && t.out_type == "fp8")
-    {
-        return moe_smoothquant_dispatch<ck_tile::bf16_t, ck_tile::fp8_t>(t, a, s);
-    }
-    else
-        throw std::runtime_error("Without supported instances!");
-}
+    return moe_smoothquant_dispatch<ck_tile::fp16_t, ck_tile::int8_t>(a, s);
+};
+
+template <>
+float moe_smoothquant<ck_tile::fp16_t, ck_tile::fp8_t>(moe_smoothquant_args a,
+                                                       const ck_tile::stream_config& s)
+{
+    return moe_smoothquant_dispatch<ck_tile::fp16_t, ck_tile::fp8_t>(a, s);
+};
+
+template <>
+float moe_smoothquant<ck_tile::bf16_t, ck_tile::int8_t>(moe_smoothquant_args a,
+                                                        const ck_tile::stream_config& s)
+{
+    return moe_smoothquant_dispatch<ck_tile::bf16_t, ck_tile::int8_t>(a, s);
+};
+
+template <>
+float moe_smoothquant<ck_tile::bf16_t, ck_tile::fp8_t>(moe_smoothquant_args a,
+                                                       const ck_tile::stream_config& s)
+{
+    return moe_smoothquant_dispatch<ck_tile::bf16_t, ck_tile::fp8_t>(a, s);
+};
diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_instance_common.hpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
index 9d8c9caf00..f2875c72c8 100644
--- a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
+++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
@@ -61,5 +61,5 @@ float moe_smoothquant_(const S& s, A a)
         std::cout << ", " << Kernel::GetName() << std::flush;
 
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 }
diff --git a/test/ck_tile/moe_smoothquant/moe_smoothquant.hpp b/test/ck_tile/moe_smoothquant/moe_smoothquant.hpp
index d137e64cb4..ced9b4ef3d 100644
--- a/test/ck_tile/moe_smoothquant/moe_smoothquant.hpp
+++ b/test/ck_tile/moe_smoothquant/moe_smoothquant.hpp
@@ -95,10 +95,5 @@ template <typename Traits_>
 float moe_smoothquant_(const ck_tile::stream_config& s, moe_smoothquant_args a);
 
 // This is the public API, will be generated by script
-struct moe_smoothquant_traits
-{
-    std::string in_type;  // input type
-    std::string out_type; // output type
-};
-
-float moe_smoothquant(moe_smoothquant_traits, moe_smoothquant_args, const ck_tile::stream_config&);
+template <typename InputType, typename OutputType>
+float moe_smoothquant(moe_smoothquant_args, const ck_tile::stream_config&);
diff --git a/test/ck_tile/moe_smoothquant/moe_smoothquant.inc b/test/ck_tile/moe_smoothquant/moe_smoothquant.inc
deleted file mode 100644
index 9e181a9d8c..0000000000
--- a/test/ck_tile/moe_smoothquant/moe_smoothquant.inc
+++ /dev/null
@@ -1,317 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#include "ck_tile/host.hpp"
-#include "moe_smoothquant.hpp"
-#include <cstring>
-#include <set>
-#include <hip/hip_runtime.h>
-
-// different threshold for different dtype
-template <typename DataType>
-auto get_elimit()
-{
-    double rtol = 1e-5;
-    double atol = 1e-5;
-    return ck_tile::make_tuple(rtol, atol);
-}
-
-template <>
-auto get_elimit<ck_tile::bf16_t>()
-{
-    double rtol = 1e-5;
-    double atol = 1e-5;
-    return ck_tile::make_tuple(rtol, atol);
-}
-
-template <>
-auto get_elimit<ck_tile::int8_t>()
-{
-    // due to rounding, int8 quantization might have 1 abs error
-    double rtol = 1;
-    double atol = 1;
-    return ck_tile::make_tuple(rtol, atol);
-}
-
-template <typename IndexType>
-void topid_unique_gen(
-    std::vector<IndexType>& host_tensor, int tokens, int topk, int num_expert, int seed)
-{
-    size_t total_size = topk * tokens;
-    std::srand(seed);
-    std::set<IndexType> unique_set;
-    IndexType current_v;
-    for(size_t i = 0; i < total_size; i++)
-    {
-        if(i % topk == 0)
-        {
-            unique_set.clear();
-        }
-        current_v = std::rand() % num_expert;
-        while(unique_set.find(current_v) != unique_set.end())
-        {
-            current_v = std::rand() % num_expert;
-        }
-        unique_set.insert(current_v);
-        host_tensor[i] = current_v;
-    }
-}
-
-auto create_args(int argc, char* argv[], int index = 0)
-{
-    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("t", "3328", "tokens dimension")
-        .insert("h", "4096", "hidden_size dimension")
-        .insert("e", "32", "experts")
-        .insert("k", "5", "topk")
-        .insert("stride", "-1", "stride per row, if -1 then equal to hidden_size")
-        .insert("v", "1", "cpu validation or not")
-        .insert("kname", "1", "print kernel name or not")
-        .insert("prec_i", "fp16", "input precision, fp16/bf16")
-        .insert("prec_o", "int8", "precision, int8/fp8")
-        .insert("warmup", "5", "cold iter")
-        .insert("repeat", "20", "hot iter");
-
-    bool result = arg_parser.parse(argc, argv, index);
-    return std::make_tuple(result, arg_parser);
-}
-
-template <typename InputType, typename OutputType>
-bool run(const ck_tile::ArgParser& arg_parser)
-{
-    ck_tile::index_t tokens      = arg_parser.get_int("t");
-    ck_tile::index_t hidden_size = arg_parser.get_int("h");
-    ck_tile::index_t stride      = arg_parser.get_int("stride");
-    if(stride < 0)
-        stride = hidden_size;
-    ck_tile::index_t experts = arg_parser.get_int("e");
-    ck_tile::index_t topk    = arg_parser.get_int("k");
-    std::string prec_i       = arg_parser.get_str("prec_i");
-    std::string prec_o       = arg_parser.get_str("prec_o");
-    int kname                = arg_parser.get_int("kname");
-    int do_validation        = arg_parser.get_int("v");
-    int warmup               = arg_parser.get_int("warmup");
-    int repeat               = arg_parser.get_int("repeat");
-
-    assert(stride >= hidden_size);
-
-    using TypeConfig = MoeSmoothquantTypeConfig<InputType, OutputType>;
-
-    using XDataType           = typename TypeConfig::XDataType;
-    using SmoothScaleDataType = typename TypeConfig::SmoothScaleDataType;
-    using YScaleDataType      = typename TypeConfig::YScaleDataType;
-    using QYDataType          = typename TypeConfig::QYDataType;
-    using ComputeDataType     = typename TypeConfig::ComputeDataType;
-
-    // host verify
-    ck_tile::HostTensor<XDataType> x_host({tokens, hidden_size}, {stride, 1});
-    ck_tile::HostTensor<SmoothScaleDataType> smscale_host({experts * hidden_size});
-    ck_tile::HostTensor<ck_tile::index_t> topk_ids_host({tokens, topk});
-
-    ck_tile::HostTensor<YScaleDataType> yscale_host_ref({topk * tokens}, {1});
-    ck_tile::HostTensor<YScaleDataType> yscale_host_dev({topk * tokens}, {1});
-
-    ck_tile::HostTensor<QYDataType> qy_host_ref({topk * tokens, hidden_size}, {stride, 1});
-    ck_tile::HostTensor<QYDataType> qy_host_dev({topk * tokens, hidden_size}, {stride, 1});
-
-    topid_unique_gen<ck_tile::index_t>(topk_ids_host.mData, tokens, topk, experts, 11937);
-    ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
-    ck_tile::FillUniformDistribution<SmoothScaleDataType>{1e-3, .5f}(smscale_host);
-
-    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem smscale_buf(smscale_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem topk_ids_buf(topk_ids_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes());
-
-    x_buf.ToDevice(x_host.data());
-    smscale_buf.ToDevice(smscale_host.data());
-    topk_ids_buf.ToDevice(topk_ids_host.data());
-
-    std::cout << "[" << prec_i << "-" << prec_o << "]" << " tokens:" << tokens
-              << ", hidden_size:" << hidden_size << ", stride:" << stride << ", experts:" << experts
-              << ", topk:" << topk << std::flush;
-
-    moe_smoothquant_traits traits{prec_i, prec_o};
-
-    moe_smoothquant_args args{x_buf.GetDeviceBuffer(),
-                              smscale_buf.GetDeviceBuffer(),
-                              topk_ids_buf.GetDeviceBuffer(),
-                              yscale_buf.GetDeviceBuffer(),
-                              qy_buf.GetDeviceBuffer(),
-                              tokens,
-                              hidden_size,
-                              experts,
-                              topk,
-                              stride,
-                              stride};
-
-    float ave_time = moe_smoothquant(
-        traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
-
-    std::size_t num_byte = sizeof(XDataType) * tokens * hidden_size +
-                           sizeof(SmoothScaleDataType) * topk * hidden_size +
-                           sizeof(YScaleDataType) * topk * tokens +
-                           sizeof(QYDataType) * topk * tokens * hidden_size;
-
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-    std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush;
-
-    bool pass = true;
-
-    if(do_validation)
-    {
-        using YDataType = ComputeDataType;
-        ck_tile::HostTensor<ComputeDataType> y_host({topk * tokens, hidden_size}, {stride, 1});
-        // smooth outlier
-        {
-            auto f = [&](auto i_token) {
-                for(int i_topk = 0; i_topk < topk; i_topk++)
-                {
-                    auto i_expert = topk_ids_host(i_token, i_topk);
-
-                    for(int i_h = 0; i_h < hidden_size; ++i_h)
-                    {
-                        auto v_smscale = ck_tile::type_convert<ComputeDataType>(
-                            smscale_host(i_expert * hidden_size + i_h));
-                        auto v_x = ck_tile::type_convert<ComputeDataType>(x_host(i_token, i_h));
-                        // y_host(i_token * topk + i_topk, i_h) = v_x * v_smscale;
-                        y_host(i_topk * tokens + i_token, i_h) = v_x * v_smscale;
-                    }
-                }
-            };
-
-            ck_tile::make_ParallelTensorFunctor(f, tokens)(std::thread::hardware_concurrency());
-        }
-
-        // yscale
-        {
-            ck_tile::HostTensor<YDataType> y_rowwise_amax_host({topk * tokens});
-
-            using ReduceAmax = ck_tile::ReduceOp::AbsMax;
-            ck_tile::reference_reduce<ComputeDataType, ComputeDataType, YDataType>(
-                y_host, y_rowwise_amax_host, ReduceAmax{});
-
-            auto op = [](const auto& v0) {
-                return v0 /
-                       ck_tile::type_convert<ComputeDataType>(ck_tile::numeric<QYDataType>::max());
-            };
-            ck_tile::reference_unary_elementwise<YDataType, YScaleDataType, ComputeDataType>(
-                y_rowwise_amax_host, yscale_host_ref, op);
-
-            yscale_buf.FromDevice(yscale_host_dev.mData.data());
-
-            auto [rtol, atol] = get_elimit<YScaleDataType>();
-            pass &= ck_tile::check_err(yscale_host_dev,
-                                       yscale_host_ref,
-                                       std::string("yscale Error: Incorrect results!"),
-                                       rtol,
-                                       atol);
-        }
-
-        // rowwise quantization
-        {
-            ck_tile::reference_rowwise_quantization2d<YDataType, YScaleDataType, QYDataType>(
-                y_host, yscale_host_ref, qy_host_ref);
-
-            qy_buf.FromDevice(qy_host_dev.data());
-            auto [rtol, atol] = get_elimit<QYDataType>();
-
-            if(stride == hidden_size)
-            {
-                pass = ck_tile::check_err(qy_host_dev,
-                                          qy_host_ref,
-                                          std::string("qy Error: Incorrect results!"),
-                                          rtol,
-                                          atol);
-            }
-            else
-            {
-                for(int i_r = 0; i_r < topk * tokens; i_r++)
-                {
-                    std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * stride,
-                                                            qy_host_dev.begin() + i_r * stride +
-                                                                hidden_size);
-                    std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * stride,
-                                                            qy_host_ref.begin() + i_r * stride +
-                                                                hidden_size);
-                    pass &= ck_tile::check_err(qy_host_dev_row,
-                                               qy_host_ref_row,
-                                               std::string("qy[") + std::to_string(i_r) +
-                                                   std::string("] Error: Incorrect results!"),
-                                               rtol,
-                                               atol);
-                }
-            }
-        }
-
-        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
-    }
-
-    return pass;
-}
-
-std::vector<std::vector<std::string>> generate_test_cases(const std::string prec_in,
-                                                          const std::string prec_out)
-{
-    return {{"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=99", "-h=13", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=17", "-h=16", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=1", "-h=100", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=4", "-h=128", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=80", "-h=127", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=22", "-h=255", "-stride=256"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=7", "-h=599", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=19", "-h=512", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=33", "-h=313", "-stride=1000"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=11", "-h=510", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=171", "-h=676", "-stride=818"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=12", "-h=768", "-stride=800"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=100", "-h=766", "-stride=812"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=31", "-h=1024", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=64", "-h=1000", "-stride=1004"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=8", "-h=1501", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=3", "-h=1826", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=5", "-h=2040", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=7", "-h=2734", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=1", "-h=3182", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=9", "-h=4096", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=3", "-h=8192", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=1", "-h=10547", "-stride=-1"},
-            {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=3", "-h=17134", "-stride=-1"}};
-}
-
-template <typename InputType, typename OutputType>
-bool run_test_case(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return false;
-
-    return run<InputType, OutputType>(arg_parser);
-}
-
-template <typename InputType, typename OutputType>
-bool run_test_cases(std::vector<std::vector<std::string>>& test_cases)
-{
-    bool valid             = true;
-    constexpr int num_args = 5;
-    char* argv[num_args];
-
-    for(std::size_t test_idx = 0; test_idx < test_cases.size(); ++test_idx)
-    {
-
-        assert(num_args == test_cases[test_idx].size() && "invalid number of arguments");
-
-        for(int arg_idx = 0; arg_idx < num_args; ++arg_idx)
-        {
-            argv[arg_idx] = test_cases[test_idx][arg_idx].data();
-        }
-
-        valid = valid && run_test_case<InputType, OutputType>(num_args, argv);
-
-        if(!valid)
-            break;
-    }
-
-    return valid;
-}
diff --git a/test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_fp8.cpp b/test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_fp8.cpp
deleted file mode 100644
index 3b5350da4b..0000000000
--- a/test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_fp8.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#include "moe_smoothquant.inc"
-
-int main()
-{
-    std::vector<std::vector<std::string>> test_cases = generate_test_cases("bf16", "fp8");
-
-    return !run_test_cases<ck_tile::bf16_t, ck_tile::fp8_t>(test_cases);
-}
diff --git a/test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_int8.cpp b/test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_int8.cpp
deleted file mode 100644
index 4751273f1d..0000000000
--- a/test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_int8.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#include "moe_smoothquant.inc"
-
-int main()
-{
-    std::vector<std::vector<std::string>> test_cases = generate_test_cases("bf16", "int8");
-
-    return !run_test_cases<ck_tile::bf16_t, ck_tile::int8_t>(test_cases);
-}
diff --git a/test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_fp8.cpp b/test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_fp8.cpp
deleted file mode 100644
index b9932dee65..0000000000
--- a/test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_fp8.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#include "moe_smoothquant.inc"
-
-int main()
-{
-    std::vector<std::vector<std::string>> test_cases = generate_test_cases("fp16", "fp8");
-
-    return !run_test_cases<ck_tile::half_t, ck_tile::fp8_t>(test_cases);
-}
diff --git a/test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_int8.cpp b/test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_int8.cpp
deleted file mode 100644
index 91c53b77bc..0000000000
--- a/test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_int8.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#include "moe_smoothquant.inc"
-
-int main()
-{
-    std::vector<std::vector<std::string>> test_cases = generate_test_cases("fp16", "int8");
-
-    return !run_test_cases<ck_tile::half_t, ck_tile::int8_t>(test_cases);
-}
diff --git a/test/ck_tile/moe_smoothquant/test_moe_smoothquant.cpp b/test/ck_tile/moe_smoothquant/test_moe_smoothquant.cpp
new file mode 100644
index 0000000000..dcd7ba2d26
--- /dev/null
+++ b/test/ck_tile/moe_smoothquant/test_moe_smoothquant.cpp
@@ -0,0 +1,14 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include "test_moe_smoothquant_types.hpp"
+#include "test_moe_smoothquant_util.hpp"
+#include "gtest/gtest.h"
+
+#define TEST_SUITE_NAME TestCkTileMoeSmoothquant
+
+TYPED_TEST_SUITE(TestCkTileMoeSmoothquant, KernelTypesMoeSmoothquant);
+
+#include "test_moe_smoothquant_cases.inc"
+
+#undef TEST_SUITE_NAME
diff --git a/test/ck_tile/moe_smoothquant/test_moe_smoothquant_cases.inc b/test/ck_tile/moe_smoothquant/test_moe_smoothquant_cases.inc
new file mode 100644
index 0000000000..12e8b5edc6
--- /dev/null
+++ b/test/ck_tile/moe_smoothquant/test_moe_smoothquant_cases.inc
@@ -0,0 +1,206 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#pragma once
+
+#ifndef TEST_MOE_SMOOTHQUANT_CASES_INC
+#define TEST_MOE_SMOOTHQUANT_CASES_INC
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t99_h13)
+{
+    ck_tile::index_t tokens      = 99;
+    ck_tile::index_t hidden_size = 13;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t17_h16)
+{
+    ck_tile::index_t tokens      = 17;
+    ck_tile::index_t hidden_size = 16;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t1_h100)
+{
+    ck_tile::index_t tokens      = 1;
+    ck_tile::index_t hidden_size = 100;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t4_h128)
+{
+    ck_tile::index_t tokens      = 4;
+    ck_tile::index_t hidden_size = 128;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t80_h127)
+{
+    ck_tile::index_t tokens      = 80;
+    ck_tile::index_t hidden_size = 127;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t22_h255)
+{
+    ck_tile::index_t tokens      = 22;
+    ck_tile::index_t hidden_size = 255;
+    ck_tile::index_t stride      = 256;
+
+    this->Run(tokens, hidden_size, stride);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t7_h599)
+{
+    ck_tile::index_t tokens      = 7;
+    ck_tile::index_t hidden_size = 599;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t19_h512)
+{
+    ck_tile::index_t tokens      = 19;
+    ck_tile::index_t hidden_size = 512;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t33_h313)
+{
+    ck_tile::index_t tokens      = 33;
+    ck_tile::index_t hidden_size = 313;
+    ck_tile::index_t stride      = 1000;
+
+    this->Run(tokens, hidden_size, stride);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t11_h510)
+{
+    ck_tile::index_t tokens      = 11;
+    ck_tile::index_t hidden_size = 510;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t171_h676)
+{
+    ck_tile::index_t tokens      = 171;
+    ck_tile::index_t hidden_size = 676;
+    ck_tile::index_t stride      = 818;
+
+    this->Run(tokens, hidden_size, stride);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t12_h768)
+{
+    ck_tile::index_t tokens      = 12;
+    ck_tile::index_t hidden_size = 768;
+    ck_tile::index_t stride      = 800;
+
+    this->Run(tokens, hidden_size, stride);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t100_h766)
+{
+    ck_tile::index_t tokens      = 100;
+    ck_tile::index_t hidden_size = 766;
+    ck_tile::index_t stride      = 812;
+
+    this->Run(tokens, hidden_size, stride);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t31_h1024)
+{
+    ck_tile::index_t tokens      = 31;
+    ck_tile::index_t hidden_size = 1024;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t64_h1000)
+{
+    ck_tile::index_t tokens      = 64;
+    ck_tile::index_t hidden_size = 1000;
+    ck_tile::index_t stride      = 1004;
+
+    this->Run(tokens, hidden_size, stride);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t8_h1501)
+{
+    ck_tile::index_t tokens      = 8;
+    ck_tile::index_t hidden_size = 1501;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t3_h1826)
+{
+    ck_tile::index_t tokens      = 3;
+    ck_tile::index_t hidden_size = 1826;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t5_h2040)
+{
+    ck_tile::index_t tokens      = 5;
+    ck_tile::index_t hidden_size = 2040;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t7_h2734)
+{
+    ck_tile::index_t tokens      = 7;
+    ck_tile::index_t hidden_size = 2734;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t1_h3182)
+{
+    ck_tile::index_t tokens      = 1;
+    ck_tile::index_t hidden_size = 3182;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t9_h4096)
+{
+    ck_tile::index_t tokens      = 9;
+    ck_tile::index_t hidden_size = 4096;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t3_h8192)
+{
+    ck_tile::index_t tokens      = 3;
+    ck_tile::index_t hidden_size = 8192;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t1_h10547)
+{
+    ck_tile::index_t tokens      = 1;
+    ck_tile::index_t hidden_size = 10547;
+
+    this->Run(tokens, hidden_size);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSmoothquant_t3_h17134)
+{
+    ck_tile::index_t tokens      = 3;
+    ck_tile::index_t hidden_size = 17134;
+
+    this->Run(tokens, hidden_size);
+}
+
+#endif
diff --git a/test/ck_tile/moe_smoothquant/test_moe_smoothquant_types.hpp b/test/ck_tile/moe_smoothquant/test_moe_smoothquant_types.hpp
new file mode 100644
index 0000000000..7855def63d
--- /dev/null
+++ b/test/ck_tile/moe_smoothquant/test_moe_smoothquant_types.hpp
@@ -0,0 +1,11 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include <tuple>
+#include "ck_tile/host.hpp"
+#include "gtest/gtest.h"
+
+using KernelTypesMoeSmoothquant = ::testing::Types<std::tuple<ck_tile::bf16_t, ck_tile::fp8_t>,
+                                                   std::tuple<ck_tile::bf16_t, ck_tile::int8_t>,
+                                                   std::tuple<ck_tile::fp16_t, ck_tile::fp8_t>,
+                                                   std::tuple<ck_tile::fp16_t, ck_tile::int8_t>>;
diff --git a/test/ck_tile/moe_smoothquant/test_moe_smoothquant_util.hpp b/test/ck_tile/moe_smoothquant/test_moe_smoothquant_util.hpp
new file mode 100644
index 0000000000..18993a6e97
--- /dev/null
+++ b/test/ck_tile/moe_smoothquant/test_moe_smoothquant_util.hpp
@@ -0,0 +1,218 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include "ck_tile/host.hpp"
+#include "moe_smoothquant.hpp"
+#include <cstring>
+#include <set>
+#include <hip/hip_runtime.h>
+
+// Default (rtol, atol) pair handed to check_err; specialized per data type below.
+template <typename DataType>
+auto get_elimit()
+{
+    double rtol = 1e-5;
+    double atol = 1e-5;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+inline auto get_elimit<ck_tile::bf16_t>()
+{
+    // bf16 uses the same tolerances as the primary template; inline keeps this header ODR-safe.
+    double tol = 1e-5;
+    return ck_tile::make_tuple(tol, tol);
+}
+
+template <>
+inline auto get_elimit<ck_tile::int8_t>()
+{
+    // Rounding can move an int8 quantized value by one step, so tolerate an error of 1.
+    double rtol = 1;
+    double atol = 1;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+// Fills host_tensor with topk distinct ids in [0, num_expert) per token, using std::rand.
+template <typename IndexType>
+void topid_unique_gen(
+    std::vector<IndexType>& host_tensor, int tokens, int topk, int num_expert, int seed)
+{
+    // The rejection loop below cannot terminate if a token needs more ids than exist.
+    ASSERT_LE(topk, num_expert) << "topk must not exceed num_expert";
+    const size_t total_size = static_cast<size_t>(topk) * tokens; // widen before multiplying
+    std::srand(seed);
+    std::set<IndexType> unique_set;
+    for(size_t i = 0; i < total_size; i++)
+    {
+        if(i % topk == 0)
+            unique_set.clear(); // next token: ids only have to be unique within one token
+        IndexType current_v = std::rand() % num_expert;
+        while(unique_set.find(current_v) != unique_set.end())
+        {
+            current_v = std::rand() % num_expert;
+        }
+        unique_set.insert(current_v);
+        host_tensor[i] = current_v;
+    }
+}
+
+template <typename Tuple>
+class TestCkTileMoeSmoothquant : public ::testing::Test
+{
+    protected:
+    using InputType  = std::tuple_element_t<0, Tuple>;
+    using OutputType = std::tuple_element_t<1, Tuple>;
+    // Launches moe_smoothquant once and checks yscale/qy against a host reference.
+    void Run(ck_tile::index_t tokens,
+             ck_tile::index_t hidden_size,
+             ck_tile::index_t stride  = -1,
+             ck_tile::index_t experts = 32,
+             ck_tile::index_t topk    = 5)
+    {
+        // A negative stride selects densely packed rows.
+        if(stride < 0)
+            stride = hidden_size;
+        ASSERT_GE(stride, hidden_size); // unlike assert(), still checked under NDEBUG
+
+        using TypeConfig = MoeSmoothquantTypeConfig<InputType, OutputType>;
+
+        using XDataType           = typename TypeConfig::XDataType;
+        using SmoothScaleDataType = typename TypeConfig::SmoothScaleDataType;
+        using YScaleDataType      = typename TypeConfig::YScaleDataType;
+        using QYDataType          = typename TypeConfig::QYDataType;
+        using ComputeDataType     = typename TypeConfig::ComputeDataType;
+
+        // host inputs, plus reference (_ref) and device-result (_dev) output tensors
+        ck_tile::HostTensor<XDataType> x_host({tokens, hidden_size}, {stride, 1});
+        ck_tile::HostTensor<SmoothScaleDataType> smscale_host({experts * hidden_size});
+        ck_tile::HostTensor<ck_tile::index_t> topk_ids_host({tokens, topk});
+
+        ck_tile::HostTensor<YScaleDataType> yscale_host_ref({topk * tokens}, {1});
+        ck_tile::HostTensor<YScaleDataType> yscale_host_dev({topk * tokens}, {1});
+
+        ck_tile::HostTensor<QYDataType> qy_host_ref({topk * tokens, hidden_size}, {stride, 1});
+        ck_tile::HostTensor<QYDataType> qy_host_dev({topk * tokens, hidden_size}, {stride, 1});
+
+        topid_unique_gen<ck_tile::index_t>(topk_ids_host.mData, tokens, topk, experts, 11937);
+        ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
+        ck_tile::FillUniformDistribution<SmoothScaleDataType>{1e-3, .5f}(smscale_host);
+
+        ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem smscale_buf(smscale_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem topk_ids_buf(topk_ids_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes());
+
+        x_buf.ToDevice(x_host.data());
+        smscale_buf.ToDevice(smscale_host.data());
+        topk_ids_buf.ToDevice(topk_ids_host.data());
+
+        std::cout << "tokens:" << tokens << ", hidden_size:" << hidden_size << ", stride:" << stride
+                  << ", experts:" << experts << ", topk:" << topk << std::flush;
+
+        moe_smoothquant_args args{x_buf.GetDeviceBuffer(),
+                                  smscale_buf.GetDeviceBuffer(),
+                                  topk_ids_buf.GetDeviceBuffer(),
+                                  yscale_buf.GetDeviceBuffer(),
+                                  qy_buf.GetDeviceBuffer(),
+                                  tokens,
+                                  hidden_size,
+                                  experts,
+                                  topk,
+                                  stride,
+                                  stride};
+
+        moe_smoothquant<InputType, OutputType>(args, ck_tile::stream_config{nullptr, false});
+
+        bool pass = true;
+
+        using YDataType = ComputeDataType;
+        ck_tile::HostTensor<ComputeDataType> y_host({topk * tokens, hidden_size}, {stride, 1});
+        // reference step 1: y = x * smooth-scale of the expert picked for each (token, topk)
+        {
+            auto f = [&](auto i_token) {
+                for(int i_topk = 0; i_topk < topk; i_topk++)
+                {
+                    auto i_expert = topk_ids_host(i_token, i_topk);
+
+                    for(int i_h = 0; i_h < hidden_size; ++i_h)
+                    {
+                        auto v_smscale = ck_tile::type_convert<ComputeDataType>(
+                            smscale_host(i_expert * hidden_size + i_h));
+                        auto v_x = ck_tile::type_convert<ComputeDataType>(x_host(i_token, i_h));
+                        // output rows are topk-major: row = i_topk * tokens + i_token
+                        y_host(i_topk * tokens + i_token, i_h) = v_x * v_smscale;
+                    }
+                }
+            };
+
+            ck_tile::make_ParallelTensorFunctor(f, tokens)(std::thread::hardware_concurrency());
+        }
+
+        // reference step 2: yscale = amax(y) / max(QYDataType), one value per output row
+        {
+            ck_tile::HostTensor<YDataType> y_rowwise_amax_host({topk * tokens});
+
+            using ReduceAmax = ck_tile::ReduceOp::AbsMax;
+            ck_tile::reference_reduce<ComputeDataType, ComputeDataType, YDataType>(
+                y_host, y_rowwise_amax_host, ReduceAmax{});
+
+            auto op = [](const auto& v0) {
+                return v0 /
+                       ck_tile::type_convert<ComputeDataType>(ck_tile::numeric<QYDataType>::max());
+            };
+            ck_tile::reference_unary_elementwise<YDataType, YScaleDataType, ComputeDataType>(
+                y_rowwise_amax_host, yscale_host_ref, op);
+
+            yscale_buf.FromDevice(yscale_host_dev.mData.data());
+
+            auto [rtol, atol] = get_elimit<YScaleDataType>();
+            pass &= ck_tile::check_err(yscale_host_dev,
+                                       yscale_host_ref,
+                                       std::string("yscale Error: Incorrect results!"),
+                                       rtol,
+                                       atol);
+        }
+
+        // reference step 3: quantize y with the reference yscale and compare with the device qy
+        {
+            ck_tile::reference_rowwise_quantization2d<YDataType, YScaleDataType, QYDataType>(
+                y_host, yscale_host_ref, qy_host_ref);
+
+            qy_buf.FromDevice(qy_host_dev.data());
+            auto [rtol, atol] = get_elimit<QYDataType>();
+            // accumulate into pass so a yscale mismatch found above is not overwritten
+            if(stride == hidden_size)
+            {
+                pass &= ck_tile::check_err(qy_host_dev,
+                                           qy_host_ref,
+                                           std::string("qy Error: Incorrect results!"),
+                                           rtol,
+                                           atol);
+            }
+            else
+            {
+                for(int i_r = 0; i_r < topk * tokens; i_r++)
+                {
+                    std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * stride,
+                                                            qy_host_dev.begin() + i_r * stride +
+                                                                hidden_size);
+                    std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * stride,
+                                                            qy_host_ref.begin() + i_r * stride +
+                                                                hidden_size);
+                    pass &= ck_tile::check_err(qy_host_dev_row,
+                                               qy_host_ref_row,
+                                               std::string("qy[") + std::to_string(i_r) +
+                                                   std::string("] Error: Incorrect results!"),
+                                               rtol,
+                                               atol);
+                }
+            }
+        }
+
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+
+        EXPECT_TRUE(pass);
+    }
+};
diff --git a/test/ck_tile/moe_sorting/CMakeLists.txt b/test/ck_tile/moe_sorting/CMakeLists.txt
index 9a7490f0c9..5abc7df5a9 100644
--- a/test/ck_tile/moe_sorting/CMakeLists.txt
+++ b/test/ck_tile/moe_sorting/CMakeLists.txt
@@ -1,14 +1,19 @@
 # Currently ck_tile is only built on gfx90a, gfx942 and gfx950
 if(GPU_TARGETS MATCHES "gfx942" OR GPU_TARGETS MATCHES "gfx950" OR GPU_TARGETS MATCHES "gfx90a")
 
-    add_test_executable(test_ck_tile_moe_sorting_fp32 moe_sorting_fp32.cpp moe_sorting_api.cpp)
-    target_include_directories(test_ck_tile_moe_sorting_fp32 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
+    function(add_moe_sorting_test EXECUTABLE USE_2D_BUF)
+        add_gtest_executable(${EXECUTABLE} test_moe_sorting.cpp moe_sorting_api.cpp)
+        target_include_directories(${EXECUTABLE} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
 
-    set(EXAMPLE_MOE_SORTING_COMPILE_OPTIONS)
-    # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
-    list(APPEND EXAMPLE_MOE_SORTING_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
-    # list(APPEND EXAMPLE_MOE_SORTING_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
-    target_compile_options(test_ck_tile_moe_sorting_fp32 PRIVATE ${EXAMPLE_MOE_SORTING_COMPILE_OPTIONS})
+        set(EXAMPLE_MOE_SORTING_COMPILE_OPTIONS)
+        # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+        list(APPEND EXAMPLE_MOE_SORTING_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal -DMOE_SORTING_FMOE_2D_BUF=${USE_2D_BUF})
+        target_compile_options(${EXECUTABLE} PRIVATE ${EXAMPLE_MOE_SORTING_COMPILE_OPTIONS})
+
+    endfunction(add_moe_sorting_test EXECUTABLE USE_2D_BUF)
+
+    add_moe_sorting_test(test_ck_tile_moe_sorting_2d_buf 1)
+    add_moe_sorting_test(test_ck_tile_moe_sorting 0)
 
 else()
     message(DEBUG "Skipping ck_tile_moe_sorting tests for current target")
diff --git a/test/ck_tile/moe_sorting/moe_sorting_api.cpp b/test/ck_tile/moe_sorting/moe_sorting_api.cpp
index 0f25e17867..0cf600d2b4 100644
--- a/test/ck_tile/moe_sorting/moe_sorting_api.cpp
+++ b/test/ck_tile/moe_sorting/moe_sorting_api.cpp
@@ -209,7 +209,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
         auto kargs                            = kernel::MakeKargs(a);                               \
         const dim3 grids                      = kernel::GridSize(a);                                \
         const dim3 blocks                     = kernel::BlockSize(a);                               \
-        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, 0, kargs);         \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                             \
     }()
 
 #define MOE_SORTING_MP_1(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
@@ -227,7 +227,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
         auto kargs                            = kernel::MakeKargs(a);                               \
         const dim3 grids                      = kernel::GridSize(a);                                \
         const dim3 blocks                     = kernel::BlockSize(a);                               \
-        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, 0, kargs);         \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                             \
     }()
 #if MOE_SORTING_SUPPORT_LARGE_EXPERT
 #define MOE_SORTING_MP_2(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
@@ -283,7 +283,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
         const dim3 grids                      = kernel::GridSize(a);                                 \
         const dim3 blocks                     = kernel::BlockSize(a);                                \
         const auto lds_size                   = kernel::GetSmemSize(a);                              \
-        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, lds_size, kargs);   \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, lds_size, kargs);                       \
     }()
 
 #define MOR_SORTING_MP_DISPATCH_(mesh_type_, token_vec_0_, token_vec_1_, token_vec_23_)            \
@@ -334,15 +334,15 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
         }                                                                                          \
     }
 
-#define MOR_SORTING_CLEAR_WS_DISPATCH_(is_local_token_, block_size_, occu_)                 \
-    [&]() {                                                                                 \
-        using problem_ =                                                                    \
-            ck_tile::MoeSortingClearWorkspaceProblem<is_local_token_, block_size_, occu_>;  \
-        using kernel      = ck_tile::MoeSortingClearWorkspaceKernel<problem_>;              \
-        auto kargs        = kernel::MakeKargs(a);                                           \
-        const dim3 grids  = kernel::GridSize(a);                                            \
-        const dim3 blocks = kernel::BlockSize(a);                                           \
-        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, 0, kargs); \
+#define MOR_SORTING_CLEAR_WS_DISPATCH_(is_local_token_, block_size_, occu_)                \
+    [&]() {                                                                                \
+        using problem_ =                                                                   \
+            ck_tile::MoeSortingClearWorkspaceProblem<is_local_token_, block_size_, occu_>; \
+        using kernel      = ck_tile::MoeSortingClearWorkspaceKernel<problem_>;             \
+        auto kargs        = kernel::MakeKargs(a);                                          \
+        const dim3 grids  = kernel::GridSize(a);                                           \
+        const dim3 blocks = kernel::BlockSize(a);                                          \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                    \
     }()
 
 float moe_sorting_mp(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s)
diff --git a/test/ck_tile/moe_sorting/moe_sorting_fp32.cpp b/test/ck_tile/moe_sorting/moe_sorting_fp32.cpp
deleted file mode 100644
index 8a300dd890..0000000000
--- a/test/ck_tile/moe_sorting/moe_sorting_fp32.cpp
+++ /dev/null
@@ -1,544 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#include <set>
-#include <vector>
-#include <iostream>
-#include <numeric>
-#include <cassert>
-#include <cstdlib>
-#include <iostream>
-#include <time.h>
-#include <unordered_set>
-
-#include "ck_tile/core.hpp"
-#include "ck_tile/ops/reduce.hpp"
-#include "moe_sorting_api.hpp"
-
-auto create_args(int argc, char* argv[], int index = 0)
-{
-    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("v", "1", "turn CPU validation on (1) or off (0).")
-        .insert("pr_i", "int32", "index data type.  Only int32 is currently supported.")
-        .insert("pr_w", "fp32", "output weight data type. Only fp32 is currently supported.")
-        .insert("t",
-                "128",
-                "number of input tokens.\n"
-                "If \"local_t\" presents, this value indicates global concurrency of all ranks.")
-        .insert(
-            "local_t",
-            "-1",
-            "Number of local input tokens for curent rank.\n"
-            "This value must be within range \"[0, t)\", or \"-1\"(no such feature)\n"
-            "This feature is to simulate EP case where where each rank has different tokens.\n"
-            "Besides, this value will be stored in a GPU buffer, which is friendly for CUDA graph.")
-        .insert("e", "8", "number of num_experts")
-        .insert("k", "4", "topk")
-        .insert("unit", "32", "unit_size")
-#if MOE_SORTING_FMOE_2D_BUF
-        .insert("moe_buf_interm_dim", "0", "interm_dim(col) of the following fmoe buf")
-        .insert(
-            "moe_buf_elem_bytes", "2", "fmoe buf element byte size, 1:8bit, 2:16bit, 4:32bit...")
-#else
-        .insert("moe_buf_size", "0", "moe_buf_size")
-#endif
-        .insert("ci",
-                "1",
-                "clear workspace inside API or not(if \"0\", require manually clear outside)")
-        .insert(
-            "dispatch",
-            "0",
-            "dispatch policy. 0:automatically pick up kernel, 1:use single kernel, 2:use mp kernel")
-        .insert("local_eid",
-                "-1",
-                "a list of experts enabled as local expert. e.g. \"0,1,4,5\"\n"
-                "please make sure eid is in ascending order!")
-        .insert("seed",
-                "-1",
-                "seed to be used. When set to -1, a random seed will be generated each time "
-                "invoking this example")
-        .insert("kname", "0", "prints the kernel name when set to 1")
-        .insert("warmup", "5", "number of iterations before benchmark the kernel")
-        .insert("repeat", "20", "number of iterations to benchmark the kernel");
-
-    bool result = arg_parser.parse(argc, argv, index);
-    return std::make_tuple(result, arg_parser);
-}
-
-template <typename IndexType>
-void topid_unique_gen(
-    std::vector<IndexType>& host_tensor, int tokens, int topk, int num_expert, int seed)
-{
-    size_t total_size = topk * tokens;
-    std::srand(seed);
-    std::set<IndexType> unique_set;
-    IndexType current_v;
-    for(size_t i = 0; i < total_size; i++)
-    {
-        if(i % topk == 0)
-        {
-            unique_set.clear();
-        }
-        current_v = std::rand() % num_expert;
-        while(unique_set.find(current_v) != unique_set.end())
-        {
-            current_v = std::rand() % num_expert;
-        }
-        unique_set.insert(current_v);
-        host_tensor[i] = current_v;
-    }
-}
-
-template <typename WeightType, typename IndexType = ck_tile::index_t>
-bool test_moe_sorting(ck_tile::ArgParser args)
-{
-    int validate            = args.get_int("v");
-    std::string index_prec  = args.get_str("pr_i");
-    std::string weight_prec = args.get_str("pr_w");
-    int tokens              = args.get_int("t");
-    int local_tokens        = args.get_int("local_t");
-    int num_experts         = args.get_int("e");
-    int topk                = args.get_int("k");
-    int seed                = args.get_int("seed");
-    int unit_size           = args.get_int("unit");
-#if MOE_SORTING_FMOE_2D_BUF
-    int moe_buf_interm_dim = args.get_int("moe_buf_interm_dim");
-    int moe_buf_elem_bytes = args.get_int("moe_buf_elem_bytes");
-#else
-    int64_t moe_buf_size = static_cast<int64_t>(args.get_uint64("moe_buf_size"));
-#endif
-    int kname           = args.get_int("kname");
-    int warmup          = args.get_int("warmup");
-    int repeat          = args.get_int("repeat");
-    bool clear_inside   = args.get_int("ci") != 0;
-    int dispatch_policy = args.get_int("dispatch");
-
-    int max_output_ids =
-        ck_tile::integer_least_multiple(topk * tokens + num_experts * unit_size - topk, unit_size);
-
-    if(seed < 0)
-    {
-        seed = std::time(nullptr);
-    }
-
-    if(topk > num_experts)
-    {
-        printf("topk:%d value should be smaller than, or equal to number of num_experts:%d\n",
-               topk,
-               num_experts);
-        return false;
-    }
-
-    // if local_tokens == tokens, not local_token, but better avoid this since no meaning for such
-    // case
-    bool is_local_token = local_tokens >= 0 && local_tokens < tokens;
-
-    if(local_tokens > tokens)
-    {
-        printf("local_tokens:%d larger than tokens:%d, invalid\n", local_tokens, tokens);
-        return false;
-    }
-
-    bool local_expert_masking      = args.get_str("local_eid") != "-1";
-    auto local_expert_masking_host = [&]() {
-        if(local_expert_masking)
-        {
-            auto local_eid = args.get_int_vec("local_eid");
-            ck_tile::HostTensor<IndexType> v_{{num_experts}};
-            v_.SetZero();
-            for(auto eid : local_eid)
-            {
-                if(eid >= num_experts)
-                {
-                    throw std::runtime_error(
-                        "local_eid larger than number of expert, please check");
-                }
-                v_.mData[eid] = 1;
-            }
-            return v_;
-        }
-        else
-            return ck_tile::HostTensor<IndexType>{{1}};
-    }();
-
-    // tokens already considered batch size
-    ck_tile::HostTensor<IndexType> topk_ids_host({tokens, topk}, {topk, 1});
-    ck_tile::HostTensor<WeightType> weights_host({tokens, topk}, {topk, 1});
-    ck_tile::HostTensor<IndexType> sorted_ids_host({max_output_ids}, {1});
-    ck_tile::HostTensor<WeightType> sorted_weights_host({max_output_ids}, {1});
-    ck_tile::HostTensor<IndexType> sorted_expert_ids_host({max_output_ids / unit_size}, {1});
-    // for simplicity, below buffer allocate 2 dword
-    ck_tile::HostTensor<IndexType> sorted_id_cnt_host({2}, {1});
-#if MOE_SORTING_FMOE_2D_BUF
-    ck_tile::HostTensor<int8_t> moe_buf_host(
-        {static_cast<std::size_t>(is_local_token ? local_tokens : tokens) * moe_buf_interm_dim *
-         moe_buf_elem_bytes});
-    auto moe_buf_bytes = moe_buf_interm_dim == 0 ? static_cast<std::size_t>(0)
-                                                 : moe_buf_host.get_element_space_size_in_bytes();
-#else
-    ck_tile::HostTensor<float> moe_buf_host({moe_buf_size});
-    auto moe_buf_bytes = moe_buf_size == 0 ? static_cast<std::size_t>(0)
-                                           : moe_buf_host.get_element_space_size_in_bytes();
-#endif
-
-    ck_tile::FillUniformDistribution<WeightType>{-.5f, .5f}(weights_host);
-#if MOE_SORTING_FMOE_2D_BUF
-    ck_tile::FillUniformDistribution<int8_t>{-.5f, .5f}(moe_buf_host);
-#else
-    ck_tile::FillUniformDistribution<WeightType>{-.5f, .5f}(moe_buf_host);
-#endif
-    topid_unique_gen<IndexType>(topk_ids_host.mData, tokens, topk, num_experts, seed);
-
-    ck_tile::DeviceMem topk_ids_dev(topk_ids_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem weights_dev(weights_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem sorted_ids_dev(sorted_ids_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem sorted_weights_dev(sorted_weights_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem sorted_expert_ids_dev(
-        sorted_expert_ids_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem sorted_id_cnt_dev(sorted_id_cnt_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem moe_buf_dev(moe_buf_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem local_expert_masking_dev(
-        local_expert_masking_host.get_element_space_size_in_bytes());
-
-    // used for simulating dynamic_tokens for EP case
-    ck_tile::DeviceMem local_tokens_dev(sizeof(ck_tile::index_t));
-    if(is_local_token)
-    {
-        local_tokens_dev.ToDevice(&local_tokens);
-    }
-
-    topk_ids_dev.ToDevice(topk_ids_host.data());
-    weights_dev.ToDevice(weights_host.data());
-    if(moe_buf_bytes > 0)
-    {
-        moe_buf_dev.ToDevice(moe_buf_host.data());
-    }
-    if(local_expert_masking)
-        local_expert_masking_dev.ToDevice(local_expert_masking_host.data());
-
-    // if return zero, means no need workspace, can set moe_sorting_args.p_ws to nullptr
-    ck_tile::index_t workspace_size =
-        moe_sorting_get_workspace_size(tokens, num_experts, topk, dispatch_policy);
-    ck_tile::DeviceMem moe_sorting_ws(workspace_size != 0 ? workspace_size : 0);
-    if(workspace_size != 0 && clear_inside == false)
-        moe_sorting_ws.SetZero(); // note, clear here!!!!
-
-    moe_sorting_trait trait{
-        index_prec, weight_prec, local_expert_masking, clear_inside, dispatch_policy};
-
-    moe_sorting_args karg{topk_ids_dev.GetDeviceBuffer(),
-                          weights_dev.GetDeviceBuffer(),
-                          local_expert_masking ? local_expert_masking_dev.GetDeviceBuffer()
-                                               : nullptr,
-                          is_local_token ? local_tokens_dev.GetDeviceBuffer() : nullptr,
-                          sorted_ids_dev.GetDeviceBuffer(),
-                          sorted_weights_dev.GetDeviceBuffer(),
-                          sorted_expert_ids_dev.GetDeviceBuffer(),
-                          sorted_id_cnt_dev.GetDeviceBuffer(),
-                          moe_buf_bytes > 0 ? moe_buf_dev.GetDeviceBuffer() : nullptr,
-                          workspace_size != 0 ? moe_sorting_ws.GetDeviceBuffer() : nullptr,
-                          tokens,
-                          unit_size,
-                          num_experts,
-                          topk,
-#if MOE_SORTING_FMOE_2D_BUF
-                          moe_buf_interm_dim,
-                          moe_buf_elem_bytes
-#else
-                          static_cast<ck_tile::long_index_t>(moe_buf_size * sizeof(float))
-#endif
-    };
-
-    ck_tile::stream_config sc{nullptr,
-                              true,
-                              /* log_level = */ (kname ? 1 : 0),
-                              warmup,
-                              repeat};
-
-    auto ms = moe_sorting(trait, karg, sc);
-
-    printf("[%s|%s|%s|%d]tokens:%d",
-           index_prec.c_str(),
-           weight_prec.c_str(),
-           workspace_size == 0 ? "cx" : (clear_inside ? "ci" : "co"),
-           dispatch_policy,
-           tokens);
-    if(is_local_token)
-    {
-        printf("(%d)", local_tokens);
-    }
-    printf(", num_experts:%d, topk:%d, mp:%d, ", num_experts, topk, workspace_size != 0 ? 1 : 0);
-
-    if(local_expert_masking)
-    {
-        printf("local_eid:%s, ", args.get_str("local_eid").c_str());
-    }
-
-    if(moe_buf_bytes > 0)
-    {
-#if MOE_SORTING_FMOE_2D_BUF
-        printf("moe_buf:%lu(%d,%d), ",
-               static_cast<uint64_t>(moe_buf_bytes),
-               moe_buf_interm_dim,
-               moe_buf_elem_bytes);
-#else
-
-        printf("moe_buf:%lu, ", static_cast<uint64_t>(moe_buf_bytes));
-#endif
-    }
-
-    if(ms < 0)
-        printf("not supported\n");
-    else
-        printf("ms:%f, ", ms);
-    fflush(stdout);
-    if(ms < 0)
-    {
-        return false;
-    }
-
-    sorted_ids_dev.FromDevice(sorted_ids_host.data());
-    sorted_weights_dev.FromDevice(sorted_weights_host.data());
-    sorted_expert_ids_dev.FromDevice(sorted_expert_ids_host.data());
-    sorted_id_cnt_dev.FromDevice(sorted_id_cnt_host.data());
-    if(moe_buf_bytes > 0)
-    {
-        moe_buf_dev.FromDevice(moe_buf_host.data());
-    }
-
-    bool rtn = true;
-    if(validate)
-    {
-        ck_tile::HostTensor<IndexType> sorted_ids_ref({max_output_ids}, {1});
-        ck_tile::HostTensor<WeightType> sorted_weights_ref({max_output_ids}, {1});
-        ck_tile::HostTensor<IndexType> sorted_expert_ids_ref({max_output_ids / unit_size}, {1});
-
-        int32_t ref_total_tokens_post_pad = 0;
-        ck_tile::reference_moe_sorting<WeightType, IndexType>(topk_ids_host,
-                                                              weights_host,
-                                                              local_expert_masking_host,
-                                                              sorted_ids_ref,
-                                                              sorted_weights_ref,
-                                                              sorted_expert_ids_ref,
-                                                              ref_total_tokens_post_pad,
-                                                              num_experts,
-                                                              unit_size,
-                                                              is_local_token ? local_tokens
-                                                                             : tokens,
-                                                              local_expert_masking);
-        printf("total_tokens_post_pad:%d(%d), ",
-               ref_total_tokens_post_pad,
-               sorted_id_cnt_host.mData[0]);
-        if(ref_total_tokens_post_pad == sorted_id_cnt_host.mData[0])
-        {
-            size_t slen = ref_total_tokens_post_pad;
-            rtn &= ck_tile::check_err(sorted_ids_host.slice({0}, {slen}),
-                                      sorted_ids_ref.slice({0}, {slen}),
-                                      std::string("OUT Error: Incorrect ids!"),
-                                      1e-6,
-                                      1e-6);
-            rtn &= ck_tile::check_err(sorted_weights_host.slice({0}, {slen}),
-                                      sorted_weights_ref.slice({0}, {slen}),
-                                      std::string("OUT Error: Incorrect w!"),
-                                      1e-6,
-                                      1e-6);
-            rtn &= ck_tile::check_err(sorted_expert_ids_host.slice({0}, {slen / unit_size}),
-                                      sorted_expert_ids_ref.slice({0}, {slen / unit_size}),
-                                      std::string("OUT Error: Incorrect eid!"),
-                                      1e-6,
-                                      1e-6);
-            // if(is_local_token)
-            {
-                auto t_ = is_local_token ? local_tokens : tokens;
-                bool _f = t_ == sorted_id_cnt_host.mData[1];
-                rtn &= _f;
-                if(!_f)
-                {
-                    printf("not equal token buffer pad %d(%d)\n", t_, sorted_id_cnt_host.mData[1]);
-                }
-            }
-        }
-        else
-        {
-            printf("(token size not equal!!)");
-            rtn = false;
-        }
-
-        if(moe_buf_bytes)
-        {
-#if MOE_SORTING_FMOE_2D_BUF
-            ck_tile::HostTensor<int8_t> moe_buf_ref({moe_buf_bytes});
-#else
-            ck_tile::HostTensor<WeightType> moe_buf_ref({moe_buf_size});
-#endif
-            rtn &= ck_tile::check_err(
-                moe_buf_host, moe_buf_ref, std::string("OUT Error: Incorrect zero buf!"), 0, 0);
-        }
-        // rtn &= ref_total_tokens_post_pad == sorted_id_cnt_host.mData[0];
-    }
-
-    printf("valid:%s", rtn ? "y" : "n");
-    fflush(stdout);
-    if(!rtn)
-        printf(", (%d)", seed);
-    printf("\n");
-    fflush(stdout);
-    return rtn;
-}
-template <typename WeightType, typename IndexType = ck_tile::index_t>
-bool run_test_case(int argc, char* argv[])
-{
-    auto [result, args] = create_args(argc, argv);
-    if(!result)
-        return false;
-
-    return test_moe_sorting<WeightType, IndexType>(args);
-}
-
-template <typename WeightType, typename IndexType = ck_tile::index_t>
-bool run_test_cases(std::vector<std::vector<std::string>>& test_cases)
-{
-    bool valid = true;
-
-    for(std::size_t test_idx = 0; test_idx < test_cases.size(); ++test_idx)
-    {
-
-        constexpr int max_num_args = 7;
-        const int num_args         = test_cases[test_idx].size();
-
-        assert(max_num_args >= num_args && "Invalid number of arguments in test case");
-
-        char* argv[max_num_args];
-
-        for(int arg_idx = 0; arg_idx < num_args; ++arg_idx)
-        {
-            argv[arg_idx] = test_cases[test_idx][arg_idx].data();
-        }
-
-        try
-        {
-            valid = valid && run_test_case<WeightType, IndexType>(num_args, argv);
-
-            if(!valid)
-                break;
-        }
-        catch(const std::runtime_error& e)
-        {
-            std::cerr << "Runtime error: " << e.what() << '\n';
-            return false;
-        }
-    }
-
-    return valid;
-}
-
-std::vector<std::vector<std::string>> create_test_cases()
-{
-#if MOE_SORTING_FMOE_2D_BUF
-    return {{"-t=80", "-e=17", "-moe_buf_interm_dim=16", "-moe_buf_elem_bytes=4"},
-            {"-t=111", "-e=117", "-moe_buf_interm_dim=4", "-moe_buf_elem_bytes=4"},
-            {"-t=1000", "-e=55", "-moe_buf_interm_dim=1024", "-moe_buf_elem_bytes=1"},
-            {"-t=99", "-e=120", "-moe_buf_interm_dim=10244", "-moe_buf_elem_bytes=2"},
-            {"-t=175", "-e=64", "-k=8"},
-            {"-t=65", "-e=8", "-k=2"},
-            {"-t=1", "-e=25"},
-            {"-t=31", "-e=19", "-k=15"},
-            {"-t=81", "-e=37", "-k=7"},
-            {"-t=23", "-e=1", "-k=1"},
-            {"-t=127", "-e=99", "-k=19"},
-            {"-t=71", "-e=11", "-k=11"},
-            {"-t=1", "-e=1", "-k=1"},
-            {"-t=99", "-e=2", "-k=1"},
-            {"-t=333", "-e=99", "-k=13"},
-            {"-t=11", "-e=256", "-k=5"},
-            {"-t=64", "-e=455", "-k=8"},
-            {"-t=777", "-e=802", "-k=99"},
-            {"-t=4097", "-e=906", "-k=51"},
-            {"-t=128", "-e=32", "-k=5", "-local_t=6", "-moe_buf_interm_dim=262144"},
-            {"-t=13", "-e=64", "-k=3", "-local_eid=4,5,6,7,8,9,10,11"},
-            {"-t=99", "-e=33", "-k=9", "-local_eid=6,10,11,15,19"},
-            {"-t=80", "-e=99", "-k=10", "-local_eid=0,8,12,33"},
-            {"-t=11", "-e=256", "-k=5", "-local_eid=99,110,129"},
-            {"-t=128", "-e=128", "-k=6", "-moe_buf_interm_dim=163840", "-moe_buf_elem_bytes=1"},
-            {"-t=8192", "-e=32", "-k=5", "-local_t=11", "-moe_buf_interm_dim=163840"},
-            {"-t=8192",
-             "-e=32",
-             "-k=8",
-             "-local_t=12",
-             "-moe_buf_interm_dim=163840",
-             "-moe_buf_elem_bytes=1"},
-            {"-t=8192", "-e=256", "-k=5", "-local_t=13", "-moe_buf_interm_dim=163840"},
-            {"-t=8192", "-e=256", "-k=8", "-local_t=8", "-moe_buf_interm_dim=163840"},
-            {"-t=163840",
-             "-e=256",
-             "-k=8",
-             "-local_t=4",
-             "-moe_buf_interm_dim=163840",
-             "-moe_buf_elem_bytes=4"},
-            {"-t=12", "-local_t=3", "-e=256", "-k=5", "-local_eid=9,10,199,145"},
-            {"-t=67", "-local_t=9", "-e=555", "-k=5", "-local_eid=19,23,24,25,26,99"},
-            {"-t=99", "-local_t=93", "-e=121", "-local_t=4", "-moe_buf_interm_dim=10244"},
-            {"-t=536", "-local_t=345", "-e=802", "-k=99"},
-            {"-t=331", "-local_t=39", "-e=83", "-k=33"},
-            {"-t=765", "-local_t=654", "-e=783", "-k=8"},
-            {"-t=23", "-local_t=9", "-e=1", "-k=1"},
-            {"-t=7", "-local_t=0", "-e=89", "-k=1", "-local_eid=0,8,12,33"},
-            {"-t=61", "-local_t=0", "-e=333", "-k=99", "-local_eid=0,8,12,33"},
-            {"-t=133940",
-             "-local_t=111921",
-             "-e=256",
-             "-k=17",
-             "-local_t=2",
-             "-moe_buf_interm_dim=133940",
-             "-moe_buf_elem_bytes=1"}};
-
-#else
-    return {{"-t=80", "-e=17", "-moe_buf_size=16"},
-            {"-t=111", "-e=117", "-moe_buf_size=4"},
-            {"-t=1000", "-e=55", "-moe_buf_size=1024"},
-            {"-t=99", "-e=120", "-moe_buf_size=10244"},
-            {"-t=175", "-e=64", "-k=8"},
-            {"-t=65", "-e=8", "-k=2"},
-            {"-t=1", "-e=25"},
-            {"-t=31", "-e=19", "-k=15"},
-            {"-t=81", "-e=37", "-k=7"},
-            {"-t=23", "-e=1", "-k=1"},
-            {"-t=127", "-e=99", "-k=19"},
-            {"-t=71", "-e=11", "-k=11"},
-            {"-t=1", "-e=1", "-k=1"},
-            {"-t=99", "-e=2", "-k=1"},
-            {"-t=333", "-e=99", "-k=13"},
-            {"-t=11", "-e=256", "-k=5"},
-            {"-t=64", "-e=455", "-k=8"},
-            {"-t=777", "-e=802", "-k=99"},
-            {"-t=4097", "-e=906", "-k=51"},
-            {"-t=128", "-e=32", "-k=5", "-moe_buf_size=262144"},
-            {"-t=13", "-e=64", "-k=3", "-local_eid=4,5,6,7,8,9,10,11"},
-            {"-t=99", "-e=33", "-k=9", "-local_eid=6,10,11,15,19"},
-            {"-t=80", "-e=99", "-k=10", "-local_eid=0,8,12,33"},
-            {"-t=11", "-e=256", "-k=5", "-local_eid=99,110,129"},
-            {"-t=128", "-e=128", "-k=6", "-moe_buf_size=163840"},
-            {"-t=8192", "-e=32", "-k=5", "-moe_buf_size=163840"},
-            {"-t=8192", "-e=32", "-k=8", "-moe_buf_size=163840"},
-            {"-t=8192", "-e=256", "-k=5", "-moe_buf_size=163840"},
-            {"-t=8192", "-e=256", "-k=8", "-moe_buf_size=163840"},
-            {"-t=163840", "-e=256", "-k=8", "-moe_buf_size=163840"},
-            {"-t=12", "-local_t=3", "-e=256", "-k=5", "-local_eid=9,10,199,145"},
-            {"-t=67", "-local_t=9", "-e=555", "-k=5", "-local_eid=19,23,24,25,26,99"},
-            {"-t=99", "-local_t=93", "-e=121", "-moe_buf_size=10244"},
-            {"-t=536", "-local_t=345", "-e=802", "-k=99"},
-            {"-t=331", "-local_t=39", "-e=83", "-k=33"},
-            {"-t=765", "-local_t=654", "-e=783", "-k=8"},
-            {"-t=23", "-local_t=9", "-e=1", "-k=1"},
-            {"-t=7", "-local_t=0", "-e=89", "-k=1", "-local_eid=0,8,12,33"},
-            {"-t=61", "-local_t=0", "-e=333", "-k=99", "-local_eid=0,8,12,33"},
-            {"-t=133940", "-local_t=111921", "-e=256", "-k=17", "-moe_buf_size=133940"}};
-#endif
-}
-
-int main()
-{
-    std::vector<std::vector<std::string>> test_cases = create_test_cases();
-
-    return !run_test_cases<float, ck_tile::index_t>(test_cases);
-}
diff --git a/test/ck_tile/moe_sorting/test_moe_sorting.cpp b/test/ck_tile/moe_sorting/test_moe_sorting.cpp
new file mode 100644
index 0000000000..8f6cb72c24
--- /dev/null
+++ b/test/ck_tile/moe_sorting/test_moe_sorting.cpp
@@ -0,0 +1,14 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include "test_moe_sorting_types.hpp"
+#include "test_moe_sorting_util.hpp"
+#include "gtest/gtest.h"
+
+#define TEST_SUITE_NAME TestCkTileMoeSorting // name used by every TYPED_TEST in the .inc below
+
+TYPED_TEST_SUITE(TestCkTileMoeSorting, KernelTypesMoeSorting); // must match TEST_SUITE_NAME
+
+#include "test_moe_sorting_cases.inc" // test bodies are written as TYPED_TEST(TEST_SUITE_NAME, ...)
+
+#undef TEST_SUITE_NAME
diff --git a/test/ck_tile/moe_sorting/test_moe_sorting_cases.inc b/test/ck_tile/moe_sorting/test_moe_sorting_cases.inc
new file mode 100755
index 0000000000..4d44e7101e
--- /dev/null
+++ b/test/ck_tile/moe_sorting/test_moe_sorting_cases.inc
@@ -0,0 +1,1211 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#pragma once
+
+#ifndef TEST_MOE_SORTING_CASES_INC
+#define TEST_MOE_SORTING_CASES_INC
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase1) // 80 tokens, 17 experts, top-4, small moe buffer
+{
+    int tokens       = 80;
+    int local_tokens = -1; // presumably -1 turns the local-token path off; confirm in RunSingle
+    int num_experts  = 17;
+    int topk         = 4;
+    int unit_size    = 32;
+    std::vector<int> local_eid{}; // empty: presumably no local-expert mask; confirm
+#if MOE_SORTING_FMOE_2D_BUF
+    int moe_buf_interm_dim = 16;
+    int moe_buf_elem_bytes = 4;
+#else
+    int64_t moe_buf_size = 16;
+#endif
+
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF // two trailing buffer arguments in 2D mode, one otherwise
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase2) // 111 tokens, 117 experts (experts > tokens), top-4
+{
+    int tokens       = 111;
+    int local_tokens = -1; // presumably -1 turns the local-token path off; confirm in RunSingle
+    int num_experts  = 117;
+    int topk         = 4;
+    int unit_size    = 32;
+    std::vector<int> local_eid{}; // empty: presumably no local-expert mask; confirm
+#if MOE_SORTING_FMOE_2D_BUF
+    int moe_buf_interm_dim = 4;
+    int moe_buf_elem_bytes = 4;
+#else
+    int64_t moe_buf_size = 4;
+#endif
+
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF // two trailing buffer arguments in 2D mode, one otherwise
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase3) // 1000 tokens, 55 experts, top-4, 1-byte buffer elems
+{
+    int tokens       = 1000;
+    int local_tokens = -1; // presumably -1 turns the local-token path off; confirm in RunSingle
+    int num_experts  = 55;
+    int topk         = 4;
+    int unit_size    = 32;
+    std::vector<int> local_eid{}; // empty: presumably no local-expert mask; confirm
+#if MOE_SORTING_FMOE_2D_BUF
+    int moe_buf_interm_dim = 1024;
+    int moe_buf_elem_bytes = 1;
+#else
+    int64_t moe_buf_size = 1024;
+#endif
+
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF // two trailing buffer arguments in 2D mode, one otherwise
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase4) // 99 tokens, 120 experts, top-4, buffer dim/size 10244
+{
+    int tokens       = 99;
+    int local_tokens = -1; // presumably -1 turns the local-token path off; confirm in RunSingle
+    int num_experts  = 120;
+    int topk         = 4;
+    int unit_size    = 32;
+    std::vector<int> local_eid{}; // empty: presumably no local-expert mask; confirm
+#if MOE_SORTING_FMOE_2D_BUF
+    int moe_buf_interm_dim = 10244;
+    int moe_buf_elem_bytes = 2;
+#else
+    int64_t moe_buf_size = 10244;
+#endif
+
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF // two trailing buffer arguments in 2D mode, one otherwise
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase5) // 175 tokens, topk == num_experts == 8
+{
+    int tokens       = 175;
+    int local_tokens = -1; // presumably -1 turns the local-token path off; confirm in RunSingle
+    int num_experts  = 8; // NOTE(review): legacy t=175/k=8 case used 64 experts; confirm 8 is meant
+    int topk         = 8;
+    int unit_size    = 32;
+    std::vector<int> local_eid{}; // empty: presumably no local-expert mask; confirm
+#if MOE_SORTING_FMOE_2D_BUF
+    int moe_buf_interm_dim = 0; // 0: presumably no moe buffer is exercised; confirm
+    int moe_buf_elem_bytes = 2;
+#else
+    int64_t moe_buf_size = 0; // 0: presumably no moe buffer is exercised; confirm
+#endif
+
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF // two trailing buffer arguments in 2D mode, one otherwise
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase6) // 65 tokens, 8 experts, top-2
+{
+    int tokens       = 65;
+    int local_tokens = -1; // presumably -1 turns the local-token path off; confirm in RunSingle
+    int num_experts  = 8;
+    int topk         = 2;
+    int unit_size    = 32;
+    std::vector<int> local_eid{}; // empty: presumably no local-expert mask; confirm
+#if MOE_SORTING_FMOE_2D_BUF
+    int moe_buf_interm_dim = 0; // 0: presumably no moe buffer is exercised; confirm
+    int moe_buf_elem_bytes = 2;
+#else
+    int64_t moe_buf_size = 0; // 0: presumably no moe buffer is exercised; confirm
+#endif
+
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF // two trailing buffer arguments in 2D mode, one otherwise
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase7) // single token, 65 experts, top-4
+{
+    int tokens       = 1;
+    int local_tokens = -1; // presumably -1 turns the local-token path off; confirm in RunSingle
+    int num_experts  = 65; // NOTE(review): legacy t=1 case used 25 experts; confirm 65 is meant
+    int topk         = 4;
+    int unit_size    = 32;
+    std::vector<int> local_eid{}; // empty: presumably no local-expert mask; confirm
+#if MOE_SORTING_FMOE_2D_BUF
+    int moe_buf_interm_dim = 0; // 0: presumably no moe buffer is exercised; confirm
+    int moe_buf_elem_bytes = 2;
+#else
+    int64_t moe_buf_size = 0; // 0: presumably no moe buffer is exercised; confirm
+#endif
+
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF // two trailing buffer arguments in 2D mode, one otherwise
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase8) // 31 tokens, 19 experts, top-15
+{
+    int tokens       = 31;
+    int local_tokens = -1; // presumably -1 turns the local-token path off; confirm in RunSingle
+    int num_experts  = 19;
+    int topk         = 15;
+    int unit_size    = 32;
+    std::vector<int> local_eid{}; // empty: presumably no local-expert mask; confirm
+#if MOE_SORTING_FMOE_2D_BUF
+    int moe_buf_interm_dim = 0; // 0: presumably no moe buffer is exercised; confirm
+    int moe_buf_elem_bytes = 2;
+#else
+    int64_t moe_buf_size = 0; // 0: presumably no moe buffer is exercised; confirm
+#endif
+
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF // two trailing buffer arguments in 2D mode, one otherwise
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase9) // 81 tokens, 37 experts, top-7
+{
+    int tokens       = 81;
+    int local_tokens = -1; // presumably -1 turns the local-token path off; confirm in RunSingle
+    int num_experts  = 37;
+    int topk         = 7;
+    int unit_size    = 32;
+    std::vector<int> local_eid{}; // empty: presumably no local-expert mask; confirm
+#if MOE_SORTING_FMOE_2D_BUF
+    int moe_buf_interm_dim = 0; // 0: presumably no moe buffer is exercised; confirm
+    int moe_buf_elem_bytes = 2;
+#else
+    int64_t moe_buf_size = 0; // 0: presumably no moe buffer is exercised; confirm
+#endif
+
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF // two trailing buffer arguments in 2D mode, one otherwise
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase10) // 23 tokens, a single expert, top-1
+{
+    int tokens       = 23;
+    int local_tokens = -1; // presumably -1 turns the local-token path off; confirm in RunSingle
+    int num_experts  = 1;
+    int topk         = 1;
+    int unit_size    = 32;
+    std::vector<int> local_eid{}; // empty: presumably no local-expert mask; confirm
+#if MOE_SORTING_FMOE_2D_BUF
+    int moe_buf_interm_dim = 0; // 0: presumably no moe buffer is exercised; confirm
+    int moe_buf_elem_bytes = 2;
+#else
+    int64_t moe_buf_size = 0; // 0: presumably no moe buffer is exercised; confirm
+#endif
+
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF // two trailing buffer arguments in 2D mode, one otherwise
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase11) // 127 tokens, 99 experts, top-19
+{
+    int tokens       = 127;
+    int local_tokens = -1; // presumably -1 turns the local-token path off; confirm in RunSingle
+    int num_experts  = 99;
+    int topk         = 19;
+    int unit_size    = 32;
+    std::vector<int> local_eid{}; // empty: presumably no local-expert mask; confirm
+#if MOE_SORTING_FMOE_2D_BUF
+    int moe_buf_interm_dim = 0; // 0: presumably no moe buffer is exercised; confirm
+    int moe_buf_elem_bytes = 2;
+#else
+    int64_t moe_buf_size = 0; // 0: presumably no moe buffer is exercised; confirm
+#endif
+
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF // two trailing buffer arguments in 2D mode, one otherwise
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase12)
+{
+    const int tokens       = 71;
+    const int local_tokens = -1;
+    const int num_experts  = 11;
+    const int topk         = 11;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // topk equals num_experts (11); 71 tokens, no expert mask, zero-sized moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase13)
+{
+    const int tokens       = 1;
+    const int local_tokens = -1;
+    const int num_experts  = 1;
+    const int topk         = 1;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // Smallest shape: one token, one expert, topk 1; no expert mask, zero-sized moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase14)
+{
+    const int tokens       = 99;
+    const int local_tokens = -1;
+    const int num_experts  = 2;
+    const int topk         = 1;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // 99 tokens over two experts with topk 1; no expert mask, zero-sized moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase15)
+{
+    const int tokens       = 333;
+    const int local_tokens = -1;
+    const int num_experts  = 99;
+    const int topk         = 13;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // 333 tokens, 99 experts, topk 13; no expert mask, zero-sized moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase16)
+{
+    const int tokens       = 11;
+    const int local_tokens = -1;
+    const int num_experts  = 256;
+    const int topk         = 5;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // Fewer tokens (11) than experts (256), topk 5; no expert mask, zero-sized moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase17)
+{
+    const int tokens       = 64;
+    const int local_tokens = -1;
+    const int num_experts  = 455;
+    const int topk         = 8;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // 64 tokens, 455 experts, topk 8; no expert mask, zero-sized moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase18)
+{
+    const int tokens       = 777;
+    const int local_tokens = -1;
+    const int num_experts  = 802;
+    const int topk         = 99;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // 777 tokens, 802 experts, topk 99; no expert mask, zero-sized moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase19)
+{
+    const int tokens       = 4097;
+    const int local_tokens = -1;
+    const int num_experts  = 906;
+    const int topk         = 51;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // 4097 tokens, 906 experts, topk 51; no expert mask, zero-sized moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase20)
+{
+    const int tokens       = 128;
+    const int local_tokens = 6;
+    const int num_experts  = 32;
+    const int topk         = 5;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 262144;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 262144;
+#endif
+    // local_tokens (6) below tokens (128) together with a non-empty moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase21)
+{
+    const int tokens           = 13;
+    const int local_tokens     = -1;
+    const int num_experts      = 64;
+    const int topk             = 3;
+    const int unit_size        = 32;
+    std::vector<int> local_eid = {4, 5, 6, 7, 8, 9, 10, 11};
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // Expert mask covering the contiguous ids 4..11 out of 64 experts.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase22)
+{
+    const int tokens           = 99;
+    const int local_tokens     = -1;
+    const int num_experts      = 33;
+    const int topk             = 9;
+    const int unit_size        = 32;
+    std::vector<int> local_eid = {6, 10, 11, 15, 19};
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // Expert mask with five scattered ids out of 33 experts.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase23)
+{
+    const int tokens           = 80;
+    const int local_tokens     = -1;
+    const int num_experts      = 99;
+    const int topk             = 10;
+    const int unit_size        = 32;
+    std::vector<int> local_eid = {0, 8, 12, 33};
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // Expert mask with four scattered ids out of 99 experts.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase24)
+{
+    const int tokens           = 11;
+    const int local_tokens     = -1;
+    const int num_experts      = 256;
+    const int topk             = 5;
+    const int unit_size        = 32;
+    std::vector<int> local_eid = {99, 110, 129};
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // Expert mask with three ids out of 256 experts and only 11 tokens.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase25)
+{
+    const int tokens       = 128;
+    const int local_tokens = -1;
+    const int num_experts  = 128;
+    const int topk         = 6;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 163840;
+    const int moe_buf_elem_bytes = 1;
+#else
+    const int64_t moe_buf_size = 163840;
+#endif
+    // Non-empty moe buffer (1-byte elements in the 2D layout) without local_tokens or a mask.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase26)
+{
+    const int tokens      = 8192;
+    const int num_experts = 32;
+    const int topk        = 5;
+    const int unit_size   = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int local_tokens       = 11;
+    const int moe_buf_interm_dim = 163840;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int local_tokens     = -1;
+    const int64_t moe_buf_size = 163840;
+#endif
+    // 8192 tokens with a moe buffer; local_tokens is only set (11) in the 2D-buffer build.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase27)
+{
+    const int tokens       = 8192;
+    const int local_tokens = 12;
+    const int num_experts  = 32;
+    const int topk         = 8;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 163840;
+    const int moe_buf_elem_bytes = 1;
+#else
+    const int64_t moe_buf_size = 163840;
+#endif
+    // 8192 tokens, local_tokens 12, topk 8, non-empty moe buffer (1-byte elements in 2D layout).
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase28)
+{
+    const int tokens      = 8192;
+    const int num_experts = 256;
+    const int topk        = 5;
+    const int unit_size   = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int local_tokens       = 12;
+    const int moe_buf_interm_dim = 163840;
+    const int moe_buf_elem_bytes = 0;
+#else
+    const int local_tokens     = -1;
+    const int64_t moe_buf_size = 163840;
+#endif
+    // NOTE(review): moe_buf_elem_bytes 0 makes the 2D host buffer size 0; confirm this is intended.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase29)
+{
+    const int tokens      = 8192;
+    const int num_experts = 256;
+    const int topk        = 8;
+    const int unit_size   = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int local_tokens       = 8;
+    const int moe_buf_interm_dim = 163840;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int local_tokens     = -1;
+    const int64_t moe_buf_size = 163840;
+#endif
+    // 8192 tokens, 256 experts, topk 8; local_tokens is only set (8) in the 2D-buffer build.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase30)
+{
+    const int tokens      = 163840;
+    const int num_experts = 256;
+    const int topk        = 8;
+    const int unit_size   = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int local_tokens       = 4;
+    const int moe_buf_interm_dim = 163840;
+    const int moe_buf_elem_bytes = 4;
+#else
+    const int local_tokens     = -1;
+    const int64_t moe_buf_size = 0;
+#endif
+    // 163840 tokens; only the 2D-buffer build sets local_tokens (4) and a non-empty moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase31)
+{
+    const int tokens           = 12;
+    const int local_tokens     = 3;
+    const int num_experts      = 256;
+    const int topk             = 5;
+    const int unit_size        = 32;
+    std::vector<int> local_eid = {9, 10, 199, 145};
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // local_tokens (3) combined with a four-id expert mask; zero-sized moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase32)
+{
+    const int tokens           = 67;
+    const int local_tokens     = 9;
+    const int num_experts      = 555;
+    const int topk             = 5;
+    const int unit_size        = 32;
+    std::vector<int> local_eid = {19, 23, 24, 25, 26, 99};
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // local_tokens (9) combined with a six-id expert mask over 555 experts.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase33)
+{
+    const int tokens       = 99;
+    const int local_tokens = 93;
+    const int num_experts  = 121;
+    const int topk         = 4;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 10244;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 10244;
+#endif
+    // local_tokens (93) close to tokens (99) with a non-empty moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase34)
+{
+    const int tokens       = 536;
+    const int local_tokens = 345;
+    const int num_experts  = 802;
+    const int topk         = 99;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // local_tokens 345 of 536 tokens, 802 experts, topk 99; zero-sized moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase35)
+{
+    const int tokens       = 331;
+    const int local_tokens = 39;
+    const int num_experts  = 83;
+    const int topk         = 33;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // local_tokens 39 of 331 tokens, 83 experts, topk 33; zero-sized moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase36)
+{
+    const int tokens       = 765;
+    const int local_tokens = 654;
+    const int num_experts  = 783;
+    const int topk         = 8;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // local_tokens 654 of 765 tokens, 783 experts, topk 8; zero-sized moe buffer.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase37)
+{
+    const int tokens       = 23;
+    const int local_tokens = 9;
+    const int num_experts  = 1;
+    const int topk         = 1;
+    const int unit_size    = 32;
+    std::vector<int> local_eid;
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // local_tokens 9 of 23 tokens with a single expert and topk 1.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase38)
+{
+    const int tokens           = 7;
+    const int local_tokens     = 0;
+    const int num_experts      = 89;
+    const int topk             = 1;
+    const int unit_size        = 32;
+    std::vector<int> local_eid = {0, 8, 12, 33};
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // local_tokens 0 with a four-id expert mask; 7 tokens, 89 experts, topk 1.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase39)
+{
+    const int tokens           = 61;
+    const int local_tokens     = 0;
+    const int num_experts      = 333;
+    const int topk             = 99;
+    const int unit_size        = 32;
+    std::vector<int> local_eid = {0, 8, 12, 33};
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 0;
+    const int moe_buf_elem_bytes = 2;
+#else
+    const int64_t moe_buf_size = 0;
+#endif
+    // local_tokens 0 with a four-id expert mask; 61 tokens, 333 experts, topk 99.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MoeSortingCase40)
+{
+    const int tokens           = 133940;
+    const int local_tokens     = 2;
+    const int num_experts      = 256;
+    const int topk             = 17;
+    const int unit_size        = 32;
+    std::vector<int> local_eid = {0, 8, 12, 33};
+#if MOE_SORTING_FMOE_2D_BUF
+    const int moe_buf_interm_dim = 133940;
+    const int moe_buf_elem_bytes = 1;
+#else
+    const int64_t moe_buf_size = 133940;
+#endif
+    // 133940 tokens, local_tokens 2, expert mask and a non-empty moe buffer all combined.
+    this->RunSingle(tokens,
+                    local_tokens,
+                    num_experts,
+                    topk,
+                    unit_size,
+                    local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                    moe_buf_interm_dim,
+                    moe_buf_elem_bytes
+#else
+                    moe_buf_size
+#endif
+    );
+}
+#endif
diff --git a/test/ck_tile/moe_sorting/test_moe_sorting_types.hpp b/test/ck_tile/moe_sorting/test_moe_sorting_types.hpp
new file mode 100644
index 0000000000..447e48abb6
--- /dev/null
+++ b/test/ck_tile/moe_sorting/test_moe_sorting_types.hpp
@@ -0,0 +1,8 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include <tuple>
+#include "ck_tile/host.hpp"
+#include "gtest/gtest.h"
+// Typed-test type list; presumably each tuple is (weight type, index type) - confirm vs fixture.
+using KernelTypesMoeSorting = ::testing::Types<std::tuple<float, ck_tile::index_t>>;
diff --git a/test/ck_tile/moe_sorting/test_moe_sorting_util.hpp b/test/ck_tile/moe_sorting/test_moe_sorting_util.hpp
new file mode 100644
index 0000000000..5d58dcac7a
--- /dev/null
+++ b/test/ck_tile/moe_sorting/test_moe_sorting_util.hpp
@@ -0,0 +1,356 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include <set>
+#include <vector>
+#include <iostream>
+#include <numeric>
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <time.h>
+#include <unordered_set>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce.hpp"
+#include "moe_sorting_api.hpp"
+
+// Fills host_tensor with topk*tokens ids in [0, num_expert), distinct within each topk-sized row.
+template <typename IndexType>
+void topid_unique_gen(
+    std::vector<IndexType>& host_tensor, int tokens, int topk, int num_expert, int seed)
+{
+    // A row cannot hold topk distinct ids when topk > num_expert; fail instead of looping forever.
+    if(topk > num_expert)
+        throw std::runtime_error("topid_unique_gen: topk larger than num_expert");
+    const size_t total_size = static_cast<size_t>(topk) * static_cast<size_t>(tokens);
+    std::srand(seed); // same seed gives the same id sequence
+    std::set<IndexType> unique_set;
+    for(size_t i = 0; i < total_size; i++)
+    {
+        if(i % topk == 0)
+            unique_set.clear(); // first slot of a new row: forget the previous row's ids
+        IndexType current_v;
+        do
+        {
+            current_v = std::rand() % num_expert;
+        } while(!unique_set.insert(current_v).second); // redraw while the id repeats in this row
+        host_tensor[i] = current_v;
+    }
+}
+
+// Writes every element followed by "," and then a single trailing space to std::cout.
+// Declared inline so this header may be included from several translation units.
+inline void print_vector(const std::vector<int>& data)
+{
+    for(const auto& x : data)
+        std::cout << x << ",";
+    std::cout << " ";
+}
+
+template <typename Tuple>
+class TestCkTileMoeSorting : public ::testing::Test
+{
+
+    protected:
+    using WeightType = std::tuple_element_t<0, Tuple>;
+    using IndexType  = std::tuple_element_t<1, Tuple>;
+
+    void RunSingle(int tokens,
+                   int local_tokens,
+                   int num_experts,
+                   int topk,
+                   int unit_size,
+                   std::vector<int>& local_eid,
+#if MOE_SORTING_FMOE_2D_BUF
+                   int moe_buf_interm_dim,
+                   int moe_buf_elem_bytes)
+#else
+                   int64_t moe_buf_size)
+#endif
+    {
+        std::string index_prec  = get_precision_string<IndexType>();
+        std::string weight_prec = get_precision_string<WeightType>();
+
+        bool clear_inside   = true;
+        int dispatch_policy = 0;
+
+        int max_output_ids = ck_tile::integer_least_multiple(
+            topk * tokens + num_experts * unit_size - topk, unit_size);
+
+        int seed = 42; // Fixed seed for testing reproducibility
+
+        if(topk > num_experts)
+        {
+            printf("topk:%d value should be smaller than, or equal to number of num_experts:%d\n",
+                   topk,
+                   num_experts);
+            EXPECT_TRUE(false);
+        }
+
+        // if local_tokens == tokens, not local_token, but better avoid this since no meaning for
+        // such case
+        bool is_local_token = local_tokens >= 0 && local_tokens < tokens;
+
+        if(local_tokens > tokens)
+        {
+            printf("local_tokens:%d larger than tokens:%d, invalid\n", local_tokens, tokens);
+            EXPECT_TRUE(false);
+        }
+
+        bool local_expert_masking      = !local_eid.empty();
+        auto local_expert_masking_host = [&]() {
+            if(local_expert_masking)
+            {
+                // auto local_eid = args.get_int_vec("local_eid");
+                ck_tile::HostTensor<IndexType> v_{{num_experts}};
+                v_.SetZero();
+                for(auto eid : local_eid)
+                {
+                    if(eid >= num_experts)
+                    {
+                        throw std::runtime_error(
+                            "local_eid larger than number of expert, please check");
+                    }
+                    v_.mData[eid] = 1;
+                }
+                return v_;
+            }
+            else
+                return ck_tile::HostTensor<IndexType>{{1}};
+        }();
+
+        // tokens already considered batch size
+        ck_tile::HostTensor<IndexType> topk_ids_host({tokens, topk}, {topk, 1});
+        ck_tile::HostTensor<WeightType> weights_host({tokens, topk}, {topk, 1});
+        ck_tile::HostTensor<IndexType> sorted_ids_host({max_output_ids}, {1});
+        ck_tile::HostTensor<WeightType> sorted_weights_host({max_output_ids}, {1});
+        ck_tile::HostTensor<IndexType> sorted_expert_ids_host({max_output_ids / unit_size}, {1});
+        // for simplicity, below buffer allocate 2 dword
+        ck_tile::HostTensor<IndexType> sorted_id_cnt_host({2}, {1});
+#if MOE_SORTING_FMOE_2D_BUF
+        ck_tile::HostTensor<int8_t> moe_buf_host(
+            {static_cast<std::size_t>(is_local_token ? local_tokens : tokens) * moe_buf_interm_dim *
+             moe_buf_elem_bytes});
+        auto moe_buf_bytes = moe_buf_interm_dim == 0
+                                 ? static_cast<std::size_t>(0)
+                                 : moe_buf_host.get_element_space_size_in_bytes();
+#else
+        ck_tile::HostTensor<float> moe_buf_host({moe_buf_size});
+        auto moe_buf_bytes = moe_buf_size == 0 ? static_cast<std::size_t>(0)
+                                               : moe_buf_host.get_element_space_size_in_bytes();
+#endif
+
+        ck_tile::FillUniformDistribution<WeightType>{-.5f, .5f}(weights_host);
+#if MOE_SORTING_FMOE_2D_BUF
+        ck_tile::FillUniformDistribution<int8_t>{-.5f, .5f}(moe_buf_host);
+#else
+        ck_tile::FillUniformDistribution<WeightType>{-.5f, .5f}(moe_buf_host);
+#endif
+        topid_unique_gen<IndexType>(topk_ids_host.mData, tokens, topk, num_experts, seed);
+
+        ck_tile::DeviceMem topk_ids_dev(topk_ids_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem weights_dev(weights_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem sorted_ids_dev(sorted_ids_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem sorted_weights_dev(
+            sorted_weights_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem sorted_expert_ids_dev(
+            sorted_expert_ids_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem sorted_id_cnt_dev(sorted_id_cnt_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem moe_buf_dev(moe_buf_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem local_expert_masking_dev(
+            local_expert_masking_host.get_element_space_size_in_bytes());
+
+        // used for simulating dynamic_tokens for EP case
+        ck_tile::DeviceMem local_tokens_dev(sizeof(ck_tile::index_t));
+        if(is_local_token)
+        {
+            local_tokens_dev.ToDevice(&local_tokens);
+        }
+
+        topk_ids_dev.ToDevice(topk_ids_host.data());
+        weights_dev.ToDevice(weights_host.data());
+        if(moe_buf_bytes > 0)
+        {
+            moe_buf_dev.ToDevice(moe_buf_host.data());
+        }
+        if(local_expert_masking)
+            local_expert_masking_dev.ToDevice(local_expert_masking_host.data());
+
+        // if return zero, means no need workspace, can set moe_sorting_args.p_ws to nullptr
+        ck_tile::index_t workspace_size =
+            moe_sorting_get_workspace_size(tokens, num_experts, topk, dispatch_policy);
+        ck_tile::DeviceMem moe_sorting_ws(workspace_size != 0 ? workspace_size : 0);
+        if(workspace_size != 0 && clear_inside == false)
+            moe_sorting_ws.SetZero(); // note, clear here!!!!
+
+        moe_sorting_trait trait{
+            index_prec, weight_prec, local_expert_masking, clear_inside, dispatch_policy};
+
+        moe_sorting_args karg{topk_ids_dev.GetDeviceBuffer(),
+                              weights_dev.GetDeviceBuffer(),
+                              local_expert_masking ? local_expert_masking_dev.GetDeviceBuffer()
+                                                   : nullptr,
+                              is_local_token ? local_tokens_dev.GetDeviceBuffer() : nullptr,
+                              sorted_ids_dev.GetDeviceBuffer(),
+                              sorted_weights_dev.GetDeviceBuffer(),
+                              sorted_expert_ids_dev.GetDeviceBuffer(),
+                              sorted_id_cnt_dev.GetDeviceBuffer(),
+                              moe_buf_bytes > 0 ? moe_buf_dev.GetDeviceBuffer() : nullptr,
+                              workspace_size != 0 ? moe_sorting_ws.GetDeviceBuffer() : nullptr,
+                              tokens,
+                              unit_size,
+                              num_experts,
+                              topk,
+#if MOE_SORTING_FMOE_2D_BUF
+                              moe_buf_interm_dim,
+                              moe_buf_elem_bytes
+#else
+                              static_cast<ck_tile::long_index_t>(moe_buf_size * sizeof(float))
+#endif
+        };
+
+        ck_tile::stream_config sc{nullptr, false};
+
+        auto ret_val = moe_sorting(trait, karg, sc);
+
+        printf("[%s|%s|%s|%d]tokens:%d",
+               index_prec.c_str(),
+               weight_prec.c_str(),
+               workspace_size == 0 ? "cx" : (clear_inside ? "ci" : "co"),
+               dispatch_policy,
+               tokens);
+        if(is_local_token)
+        {
+            printf("(%d)", local_tokens);
+        }
+        printf(
+            ", num_experts:%d, topk:%d, mp:%d, ", num_experts, topk, workspace_size != 0 ? 1 : 0);
+
+        if(local_expert_masking)
+        {
+            printf("local_eid:");
+            print_vector(local_eid);
+        }
+
+        if(moe_buf_bytes > 0)
+        {
+#if MOE_SORTING_FMOE_2D_BUF
+            printf("moe_buf:%lu(%d,%d), ",
+                   static_cast<uint64_t>(moe_buf_bytes),
+                   moe_buf_interm_dim,
+                   moe_buf_elem_bytes);
+#else
+
+            printf("moe_buf:%lu, ", static_cast<uint64_t>(moe_buf_bytes));
+#endif
+        }
+
+        if(ret_val < 0)
+        {
+            printf("not supported\n");
+            fflush(stdout);
+            EXPECT_TRUE(false);
+        }
+
+        sorted_ids_dev.FromDevice(sorted_ids_host.data());
+        sorted_weights_dev.FromDevice(sorted_weights_host.data());
+        sorted_expert_ids_dev.FromDevice(sorted_expert_ids_host.data());
+        sorted_id_cnt_dev.FromDevice(sorted_id_cnt_host.data());
+        if(moe_buf_bytes > 0)
+        {
+            moe_buf_dev.FromDevice(moe_buf_host.data());
+        }
+
+        bool rtn = true;
+        ck_tile::HostTensor<IndexType> sorted_ids_ref({max_output_ids}, {1});
+        ck_tile::HostTensor<WeightType> sorted_weights_ref({max_output_ids}, {1});
+        ck_tile::HostTensor<IndexType> sorted_expert_ids_ref({max_output_ids / unit_size}, {1});
+
+        int32_t ref_total_tokens_post_pad = 0;
+        ck_tile::reference_moe_sorting<WeightType, IndexType>(topk_ids_host,
+                                                              weights_host,
+                                                              local_expert_masking_host,
+                                                              sorted_ids_ref,
+                                                              sorted_weights_ref,
+                                                              sorted_expert_ids_ref,
+                                                              ref_total_tokens_post_pad,
+                                                              num_experts,
+                                                              unit_size,
+                                                              is_local_token ? local_tokens
+                                                                             : tokens,
+                                                              local_expert_masking);
+        printf("total_tokens_post_pad:%d(%d), ",
+               ref_total_tokens_post_pad,
+               sorted_id_cnt_host.mData[0]);
+        if(ref_total_tokens_post_pad == sorted_id_cnt_host.mData[0])
+        {
+            size_t slen = ref_total_tokens_post_pad;
+            rtn &= ck_tile::check_err(sorted_ids_host.slice({0}, {slen}),
+                                      sorted_ids_ref.slice({0}, {slen}),
+                                      std::string("OUT Error: Incorrect ids!"),
+                                      1e-6,
+                                      1e-6);
+            rtn &= ck_tile::check_err(sorted_weights_host.slice({0}, {slen}),
+                                      sorted_weights_ref.slice({0}, {slen}),
+                                      std::string("OUT Error: Incorrect w!"),
+                                      1e-6,
+                                      1e-6);
+            rtn &= ck_tile::check_err(sorted_expert_ids_host.slice({0}, {slen / unit_size}),
+                                      sorted_expert_ids_ref.slice({0}, {slen / unit_size}),
+                                      std::string("OUT Error: Incorrect eid!"),
+                                      1e-6,
+                                      1e-6);
+
+            auto t_ = is_local_token ? local_tokens : tokens;
+            bool _f = t_ == sorted_id_cnt_host.mData[1];
+            rtn &= _f;
+            if(!_f)
+            {
+                printf("not equal token buffer pad %d(%d)\n", t_, sorted_id_cnt_host.mData[1]);
+            }
+        }
+        else
+        {
+            printf("(token size not equal!!)");
+            rtn = false;
+        }
+
+        if(moe_buf_bytes)
+        {
+#if MOE_SORTING_FMOE_2D_BUF
+            ck_tile::HostTensor<int8_t> moe_buf_ref({moe_buf_bytes});
+#else
+            ck_tile::HostTensor<WeightType> moe_buf_ref({moe_buf_size});
+#endif
+            rtn &= ck_tile::check_err(
+                moe_buf_host, moe_buf_ref, std::string("OUT Error: Incorrect zero buf!"), 0, 0);
+        }
+
+        printf("valid:%s", rtn ? "y" : "n");
+        fflush(stdout);
+        if(!rtn)
+            printf(", (%d)", seed);
+        printf("\n");
+        fflush(stdout);
+
+        EXPECT_TRUE(rtn);
+    }
+
+    // Returns the precision name for PrecisionType: "fp32" for float and "int32" for
+    // ck_tile::index_t. Any other type throws std::runtime_error("Invalid precision.").
+    template <typename PrecisionType>
+    static std::string get_precision_string()
+    {
+        if constexpr(std::is_same_v<PrecisionType, float>)
+        {
+            return "fp32";
+        }
+        else if constexpr(std::is_same_v<PrecisionType, ck_tile::index_t>)
+        {
+            return "int32";
+        }
+        else
+            throw std::runtime_error("Invalid precision.");
+    }
+};
diff --git a/test/ck_tile/permute/CMakeLists.txt b/test/ck_tile/permute/CMakeLists.txt
index 7ee55a984d..4256ad8de1 100644
--- a/test/ck_tile/permute/CMakeLists.txt
+++ b/test/ck_tile/permute/CMakeLists.txt
@@ -2,7 +2,7 @@
 if(GPU_TARGETS MATCHES "gfx9")
 
     function(add_permute_test TARGET_NAME MAIN_SRC)
-        add_test_executable(${TARGET_NAME} ${MAIN_SRC})
+        add_gtest_executable(${TARGET_NAME} ${MAIN_SRC})
 
         if(NOT DEFINED PERMUTE_USE_ALTERNATIVE_IMPL)
         set(PERMUTE_USE_ALTERNATIVE_IMPL true)
@@ -10,23 +10,11 @@ if(GPU_TARGETS MATCHES "gfx9")
 
         if(PERMUTE_USE_ALTERNATIVE_IMPL)
         target_compile_options(${TARGET_NAME} PRIVATE -DPERMUTE_USE_ALTERNATIVE_IMPL)
-        target_sources(${TARGET_NAME} PRIVATE alternative_impl/matrix_core_swizzle.cpp)
         endif()
 
     endfunction(add_permute_test TARGET_NAME MAIN_SRC)
-    
-    set(CUSTOM_TARGET_NAME test_ck_tile_permute)
 
-    add_custom_target(${CUSTOM_TARGET_NAME})
-
-    add_permute_test(test_ck_tile_permute_fp16 permute_fp16.cpp)
-    add_dependencies(${CUSTOM_TARGET_NAME} test_ck_tile_permute_fp16)
-
-    add_permute_test(test_ck_tile_permute_fp8 permute_fp8.cpp)
-    add_dependencies(${CUSTOM_TARGET_NAME} test_ck_tile_permute_fp8)
-
-    add_permute_test(test_ck_tile_permute_fp32 permute_fp32.cpp)
-    add_dependencies(${CUSTOM_TARGET_NAME} test_ck_tile_permute_fp32)
+    add_permute_test(test_ck_tile_permute test_permute.cpp)
 
 else()
     message(DEBUG "Skipping ck_tile_permute tests for current target")
diff --git a/test/ck_tile/permute/alternative_impl/matrix_core_swizzle.cpp b/test/ck_tile/permute/alternative_impl/matrix_core_swizzle.cpp
deleted file mode 100644
index aedcfac138..0000000000
--- a/test/ck_tile/permute/alternative_impl/matrix_core_swizzle.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#include "matrix_core_swizzle.hpp"
-#include "matrix_core_swizzle_kernel.hpp"
-
-float matrix_core_swizzle(matrix_core_swizzle_traits t,
-                          matrix_core_swizzle_args a,
-                          const ck_tile::stream_config& s)
-{
-    if(t.data_type.compare("fp16") == 0)
-    {
-        if(t.inst.compare("32x32x8") == 0)
-        {
-            constexpr int BLOCK_SIZE             = 256;
-            constexpr int NPerBlock              = 256;
-            constexpr int KPerBlock              = 128;
-            constexpr matrix_core_inst_enum Inst = matrix_core_inst_enum::MFMA_32x32x8_F16;
-            if(t.permute.compare("0,1,4,2,5,3,6") == 0)
-            {
-                constexpr matrix_core_permute_style pstyle =
-                    matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2;
-                using Kernel =
-                    matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
-
-                auto k         = Kernel(a);
-                float ave_time = ck_tile::launch_kernel(s, k);
-
-                return ave_time;
-            }
-            else if(t.permute.compare("0,1,2,4,5,3,6") == 0)
-            {
-                constexpr matrix_core_permute_style pstyle =
-                    matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2;
-                using Kernel =
-                    matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
-
-                auto k         = Kernel(a);
-                float ave_time = ck_tile::launch_kernel(s, k);
-
-                return ave_time;
-            }
-            else if(t.permute.compare("0,1,3,4,2,5") == 0)
-            {
-                constexpr matrix_core_permute_style pstyle =
-                    matrix_core_permute_style::b_nr_kr_kw_nw_kv;
-                using Kernel =
-                    matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
-
-                auto k         = Kernel(a);
-                float ave_time = ck_tile::launch_kernel(s, k);
-
-                return ave_time;
-            }
-        }
-        else if(t.inst.compare("16x16x16") == 0)
-        {
-            constexpr int BLOCK_SIZE             = 256;
-            constexpr int NPerBlock              = 256;
-            constexpr int KPerBlock              = 128;
-            constexpr matrix_core_inst_enum Inst = matrix_core_inst_enum::MFMA_16x16x16_F16;
-            if(t.permute.compare("0,1,4,2,5,3,6") == 0)
-            {
-                constexpr matrix_core_permute_style pstyle =
-                    matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2;
-                using Kernel =
-                    matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
-
-                auto k         = Kernel(a);
-                float ave_time = ck_tile::launch_kernel(s, k);
-
-                return ave_time;
-            }
-            else if(t.permute.compare("0,1,2,4,5,3,6") == 0)
-            {
-                constexpr matrix_core_permute_style pstyle =
-                    matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2;
-                using Kernel =
-                    matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
-
-                auto k         = Kernel(a);
-                float ave_time = ck_tile::launch_kernel(s, k);
-
-                return ave_time;
-            }
-            else if(t.permute.compare("0,1,3,4,2,5") == 0)
-            {
-                constexpr matrix_core_permute_style pstyle =
-                    matrix_core_permute_style::b_nr_kr_kw_nw_kv;
-                using Kernel =
-                    matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
-
-                auto k         = Kernel(a);
-                float ave_time = ck_tile::launch_kernel(s, k);
-
-                return ave_time;
-            }
-        }
-    }
-    return -1;
-}
diff --git a/test/ck_tile/permute/alternative_impl/matrix_core_swizzle.hpp b/test/ck_tile/permute/alternative_impl/matrix_core_swizzle.hpp
index 89dfeda4af..021cc303ad 100644
--- a/test/ck_tile/permute/alternative_impl/matrix_core_swizzle.hpp
+++ b/test/ck_tile/permute/alternative_impl/matrix_core_swizzle.hpp
@@ -7,14 +7,125 @@
 
 struct matrix_core_swizzle_traits
 {
-    std::string data_type; // fp16 only
-    std::string inst;      // 32x32x8, 16x16x16
-    std::string permute;   //
+    std::string inst; // 32x32x8, 16x16x16
+    std::string permute;
 };
 
 using matrix_core_swizzle_args = matrix_core_swizzle_host_args;
 
 // host API
+template <typename DataType> // only supported with fp16 data type
 float matrix_core_swizzle(matrix_core_swizzle_traits,
                           matrix_core_swizzle_args,
                           const ck_tile::stream_config&);
+
+// fp16 specialization of the host API. Selects a matrix_core_swizzle_kernel
+// instantiation from the runtime strings in the traits:
+//   t.inst    - "32x32x8" or "16x16x16" (picks the MFMA f16 instruction enum)
+//   t.permute - "0,1,4,2,5,3,6", "0,1,2,4,5,3,6" or "0,1,3,4,2,5"
+// Returns the value produced by ck_tile::launch_kernel for the selected kernel,
+// or -1 when the (inst, permute) combination matches none of the cases below.
+// Declared inline: an explicit specialization defined in a header is not
+// implicitly inline, so including this header from more than one translation
+// unit would otherwise produce duplicate definitions at link time.
+template <>
+inline float matrix_core_swizzle<ck_tile::half_t>(matrix_core_swizzle_traits t,
+                                                  matrix_core_swizzle_args a,
+                                                  const ck_tile::stream_config& s)
+{
+    if(t.inst.compare("32x32x8") == 0)
+    {
+        // Kernel template parameters used with the 32x32x8 instruction.
+        constexpr int BLOCK_SIZE             = 256;
+        constexpr int NPerBlock              = 256;
+        constexpr int KPerBlock              = 128;
+        constexpr matrix_core_inst_enum Inst = matrix_core_inst_enum::MFMA_32x32x8_F16;
+        if(t.permute.compare("0,1,4,2,5,3,6") == 0)
+        {
+            constexpr matrix_core_permute_style pstyle =
+                matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2;
+            using Kernel =
+                matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
+
+            auto k = Kernel(a);
+            return ck_tile::launch_kernel(s, k);
+        }
+        else if(t.permute.compare("0,1,2,4,5,3,6") == 0)
+        {
+            constexpr matrix_core_permute_style pstyle =
+                matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2;
+            using Kernel =
+                matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
+
+            auto k = Kernel(a);
+            return ck_tile::launch_kernel(s, k);
+        }
+        else if(t.permute.compare("0,1,3,4,2,5") == 0)
+        {
+            constexpr matrix_core_permute_style pstyle =
+                matrix_core_permute_style::b_nr_kr_kw_nw_kv;
+            using Kernel =
+                matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
+
+            auto k = Kernel(a);
+            return ck_tile::launch_kernel(s, k);
+        }
+    }
+    else if(t.inst.compare("16x16x16") == 0)
+    {
+        // Kernel template parameters used with the 16x16x16 instruction.
+        constexpr int BLOCK_SIZE             = 256;
+        constexpr int NPerBlock              = 256;
+        constexpr int KPerBlock              = 128;
+        constexpr matrix_core_inst_enum Inst = matrix_core_inst_enum::MFMA_16x16x16_F16;
+        if(t.permute.compare("0,1,4,2,5,3,6") == 0)
+        {
+            constexpr matrix_core_permute_style pstyle =
+                matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2;
+            using Kernel =
+                matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
+
+            auto k = Kernel(a);
+            return ck_tile::launch_kernel(s, k);
+        }
+        else if(t.permute.compare("0,1,2,4,5,3,6") == 0)
+        {
+            constexpr matrix_core_permute_style pstyle =
+                matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2;
+            using Kernel =
+                matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
+
+            auto k = Kernel(a);
+            return ck_tile::launch_kernel(s, k);
+        }
+        else if(t.permute.compare("0,1,3,4,2,5") == 0)
+        {
+            constexpr matrix_core_permute_style pstyle =
+                matrix_core_permute_style::b_nr_kr_kw_nw_kv;
+            using Kernel =
+                matrix_core_swizzle_kernel<BLOCK_SIZE, NPerBlock, KPerBlock, pstyle, Inst>;
+
+            auto k = Kernel(a);
+            return ck_tile::launch_kernel(s, k);
+        }
+    }
+
+    // No kernel instantiation matches the requested instruction / permutation.
+    return -1;
+}
+
+template <> // fp8 is not supported here: always throws. inline keeps the header ODR-safe.
+inline float matrix_core_swizzle<ck_tile::fp8_t>(matrix_core_swizzle_traits,
+                                                 matrix_core_swizzle_args,
+                                                 const ck_tile::stream_config&)
+{
+    throw std::runtime_error("Not supported for fp8");
+}
+
+template <> // fp32 is not supported here: always throws. inline keeps the header ODR-safe.
+inline float matrix_core_swizzle<float>(matrix_core_swizzle_traits,
+                                        matrix_core_swizzle_args,
+                                        const ck_tile::stream_config&)
+{
+    throw std::runtime_error("Not supported for fp32");
+}
diff --git a/test/ck_tile/permute/alternative_impl/matrix_core_swizzle_kernel.hpp b/test/ck_tile/permute/alternative_impl/matrix_core_swizzle_kernel.hpp
index c94adc24c3..498d93b656 100644
--- a/test/ck_tile/permute/alternative_impl/matrix_core_swizzle_kernel.hpp
+++ b/test/ck_tile/permute/alternative_impl/matrix_core_swizzle_kernel.hpp
@@ -1,5 +1,5 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -115,11 +115,12 @@ struct matrix_core_swizzle_kernel
 
     __host__ void operator()(const ck_tile::stream_config& s) const
     {
-        ck_tile::kentry<BLOCK_SIZE, 1, kernel><<<grids, BLOCK_SIZE, 0, s.stream_id_>>>(a);
+        ck_tile::kentry<1, kernel><<<grids, BLOCK_SIZE, 0, s.stream_id_>>>(a);
     }
 
     struct kernel
     {
+        static constexpr ck_tile::index_t kBlockSize = BLOCK_SIZE;
         __device__ static constexpr auto get_src_dist()
         {
             using namespace ck_tile;
diff --git a/test/ck_tile/permute/permute.hpp b/test/ck_tile/permute/permute.hpp
index 5724b0f316..83488a8c1b 100644
--- a/test/ck_tile/permute/permute.hpp
+++ b/test/ck_tile/permute/permute.hpp
@@ -8,12 +8,4 @@
 #include "ck_tile/ops/permute.hpp"
 #include <string>
 
-struct permute_traits
-{
-    std::string data_type;
-};
-
 using permute_args = ck_tile::GenericPermuteHostArgs;
-
-// host API
-float permute(permute_traits, permute_args, const ck_tile::stream_config&);
diff --git a/test/ck_tile/permute/permute_fp16.cpp b/test/ck_tile/permute/permute_fp16.cpp
deleted file mode 100644
index 24781261ef..0000000000
--- a/test/ck_tile/permute/permute_fp16.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#include "permute.hpp"
-#include "ck_tile/host.hpp"
-
-#include <array>
-#include <cassert>
-#include <cstring>
-#include <functional>
-#include <numeric>
-#include <ostream>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-#ifdef PERMUTE_USE_ALTERNATIVE_IMPL
-#include "alternative_impl/matrix_core_swizzle.hpp"
-#endif
-
-#include "permute_utils.inc"
-
-int main()
-{
-    std::vector<std::vector<std::string>> test_cases = create_test_cases_fp16();
-
-    return !run_test_cases<ck_tile::half_t>(test_cases);
-}
diff --git a/test/ck_tile/permute/permute_fp32.cpp b/test/ck_tile/permute/permute_fp32.cpp
deleted file mode 100644
index 2ece7c20bb..0000000000
--- a/test/ck_tile/permute/permute_fp32.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#include "permute.hpp"
-#include "ck_tile/host.hpp"
-
-#include <array>
-#include <cassert>
-#include <cstring>
-#include <functional>
-#include <numeric>
-#include <ostream>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-#ifdef PERMUTE_USE_ALTERNATIVE_IMPL
-#include "alternative_impl/matrix_core_swizzle.hpp"
-#endif
-
-#include "permute_utils.inc"
-
-int main()
-{
-    std::vector<std::vector<std::string>> test_cases = create_test_cases("fp32");
-
-    return !run_test_cases<float>(test_cases);
-}
diff --git a/test/ck_tile/permute/permute_fp8.cpp b/test/ck_tile/permute/permute_fp8.cpp
deleted file mode 100644
index e8ae5d0410..0000000000
--- a/test/ck_tile/permute/permute_fp8.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#include "permute.hpp"
-#include "ck_tile/host.hpp"
-
-#include <array>
-#include <cassert>
-#include <cstring>
-#include <functional>
-#include <numeric>
-#include <ostream>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-#ifdef PERMUTE_USE_ALTERNATIVE_IMPL
-#include "alternative_impl/matrix_core_swizzle.hpp"
-#endif
-
-#include "permute_utils.inc"
-
-int main()
-{
-    std::vector<std::vector<std::string>> test_cases = create_test_cases("fp8");
-
-    return !run_test_cases<ck_tile::fp8_t>(test_cases);
-}
diff --git a/test/ck_tile/permute/permute_utils.inc b/test/ck_tile/permute/permute_utils.inc
deleted file mode 100644
index 6b8cb86b53..0000000000
--- a/test/ck_tile/permute/permute_utils.inc
+++ /dev/null
@@ -1,490 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#pragma once
-
-namespace detail {
-template <int bytes>
-struct to_integer_type;
-
-template <>
-struct to_integer_type<4>
-{
-    using type = int32_t;
-};
-template <>
-struct to_integer_type<2>
-{
-    using type = int16_t;
-};
-template <>
-struct to_integer_type<1>
-{
-    using type = int8_t;
-};
-} // namespace detail
-
-template <int bytes>
-using to_integer_type = typename detail::to_integer_type<bytes>::type;
-
-// host API (shoule come from codegen)
-float permute(permute_traits t, permute_args a, const ck_tile::stream_config& s)
-{
-    if(t.data_type.compare("fp8") == 0)
-    {
-        using DataType        = ck_tile::fp8_t;
-        using PipelineProblem = ck_tile::GenericPermuteProblem<DataType>;
-        using Kernel          = ck_tile::GenericPermute<PipelineProblem>;
-
-        auto kargs = Kernel::MakeKargs(a);
-
-        const dim3 grids      = Kernel::GridSize(a);
-        constexpr dim3 blocks = Kernel::BlockSize();
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, 1>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    }
-    else if(t.data_type.compare("fp16") == 0)
-    {
-        using DataType        = ck_tile::half_t;
-        using PipelineProblem = ck_tile::GenericPermuteProblem<DataType>;
-        using Kernel          = ck_tile::GenericPermute<PipelineProblem>;
-
-        auto kargs = Kernel::MakeKargs(a);
-
-        const dim3 grids      = Kernel::GridSize(a);
-        constexpr dim3 blocks = Kernel::BlockSize();
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, 1>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    }
-    else if(t.data_type.compare("fp32") == 0)
-    {
-        using DataType        = float;
-        using PipelineProblem = ck_tile::GenericPermuteProblem<DataType>;
-        using Kernel          = ck_tile::GenericPermute<PipelineProblem>;
-
-        auto kargs = Kernel::MakeKargs(a);
-
-        const dim3 grids      = Kernel::GridSize(a);
-        constexpr dim3 blocks = Kernel::BlockSize();
-
-        float ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, 1>(Kernel{}, grids, blocks, 0, kargs));
-
-        return ave_time;
-    }
-
-    return 0;
-}
-
-template <typename T>
-std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
-{
-    using size_type = typename std::vector<T>::size_type;
-
-    os << "[";
-    for(size_type idx = 0; idx < v.size(); ++idx)
-    {
-        if(0 < idx)
-        {
-            os << ", ";
-        }
-        os << v[idx];
-    }
-    return os << "]";
-}
-
-auto create_args(int argc, char* argv[], int start_index = 0)
-{
-    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("v", "1", "weather do CPU validation or not")
-        .insert("prec", "fp16", "data type. fp8/fp16/fp32 (representing 8/16/32 bit data)")
-        .insert("shape", "2,3,4", "the shape of the input tensor")
-        .insert("perm", "2,1,0", "permute perm")
-        .insert("kname", "0", "t to 1 will print kernel name")
-        .insert("seed",
-                "11939",
-                "random seed used for initializing input tensors. 0 for "
-                "non-deterministic seed")
-        .insert("warmup", "5", "number of iterations before benchmark the kernel")
-        .insert("repeat", "20", "number of iterations to benchmark the kernel");
-
-    bool result = arg_parser.parse(argc, argv, start_index);
-    return std::make_tuple(result, arg_parser);
-}
-
-// different threshold for different dtype
-template <typename DataType>
-auto get_elimit(std::string /*init_method*/)
-{
-    double rtol = 1e-3;
-    double atol = 1e-3;
-    return ck_tile::make_tuple(rtol, atol);
-}
-
-template <>
-auto get_elimit<ck_tile::bf16_t>(std::string /*init_method*/)
-{
-    double rtol = 1e-2;
-    double atol = 1e-2;
-    return ck_tile::make_tuple(rtol, atol);
-}
-
-template <>
-auto get_elimit<ck_tile::fp8_t>(std::string init_method)
-{
-    if(init_method == "ui" || init_method == "ni")
-    {
-        unsigned max_rounding_point_distance = 0;
-        double atol                          = 2e-3;
-        return ck_tile::make_tuple(max_rounding_point_distance, atol);
-    }
-    else
-    {
-        unsigned max_rounding_point_distance = 1;
-        double atol                          = 0.0625;
-        return ck_tile::make_tuple(max_rounding_point_distance, atol);
-    }
-}
-
-// "1,2,3,4" -> vector{1,2,3,4}
-std::vector<ck_tile::index_t> decode_vec(std::string q_val)
-{
-#define _S2I_(str_) static_cast<ck_tile::index_t>(std::atoi((str_).c_str()))
-    std::string::size_type pos = 0;
-    std::vector<ck_tile::index_t> v;
-    while(true)
-    {
-        auto found = q_val.find(',', pos);
-        ck_tile::index_t n =
-            _S2I_(q_val.substr(pos, found == std::string::npos ? found : found - pos));
-        v.push_back(n);
-        if(found == std::string::npos)
-        {
-            break;
-        }
-        pos = found + 1;
-    }
-    return v;
-#undef _S2I_
-}
-
-template <typename DataType>
-bool run(const ck_tile::ArgParser& arg_parser)
-{
-    std::string data_type = arg_parser.get_str("prec");
-    int do_validation     = arg_parser.get_int("v");
-
-    auto shape        = decode_vec(arg_parser.get_str("shape"));
-    auto perm         = decode_vec(arg_parser.get_str("perm"));
-    int stream_warmup = arg_parser.get_int("warmup");
-    int stream_repeat = arg_parser.get_int("repeat");
-    bool kname        = arg_parser.get_bool("kname");
-    int seed          = arg_parser.get_int("seed");
-
-    assert(shape.size() == perm.size());
-    ck_tile::index_t rank = perm.size();
-    if(rank > ck_tile::GenericPermuteHostArgs::kMaxRanks)
-    {
-        printf("rank %d permute is not support yet\n", rank);
-        return false;
-    }
-
-    ck_tile::HostTensor<DataType> x(shape);
-    ck_tile::FillUniformDistributionIntegerValue<DataType>{-15, 15, seed}(x);
-
-    std::vector<ck_tile::index_t> y_shape = [&]() {
-        std::vector<ck_tile::index_t> tmp(rank, 0);
-        // std::cout << "@@@@" << tmp << std::endl;
-        for(int i = 0; i < static_cast<int>(rank); i++)
-        {
-            // std::cout << "  i:" << i << ", perm:" << perm[i] << ", rak:" <<
-            // static_cast<int>(rank)
-            // << std::endl;
-            tmp[i] = shape[perm[i]];
-        }
-        // std::cout << "@@@" << tmp << std::endl;
-        return tmp;
-    }();
-
-    ck_tile::HostTensor<DataType> y(y_shape);
-
-    ck_tile::DeviceMem x_buf(x.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem y_buf(y.get_element_space_size_in_bytes());
-
-    x_buf.ToDevice(x.data());
-
-    std::cout << "[" << data_type << "] shape:" << shape << "->" << y_shape << ", permute:" << perm
-              << std::endl;
-
-    ck_tile::stream_config stream_config{nullptr,
-                                         true,
-                                         /* log_level = */ (kname ? 1 : 0),
-                                         stream_warmup,
-                                         stream_repeat};
-    float ave_time   = 0.f;
-    auto run_permute = [&]() {
-        permute_traits t;
-        t.data_type = data_type;
-
-        permute_args a;
-        a.p_src = x_buf.GetDeviceBuffer();
-        a.p_dst = y_buf.GetDeviceBuffer();
-        a.rank  = rank;
-        std::copy(shape.begin(), shape.end(), a.shape);
-        std::copy(perm.begin(), perm.end(), a.perm);
-
-        return permute(t, a, stream_config);
-    };
-#ifdef PERMUTE_USE_ALTERNATIVE_IMPL
-    // batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2
-    if((arg_parser.get_str("perm") == std::string("0,1,4,2,5,3,6") ||
-        arg_parser.get_str("perm") == std::string("0,1,2,4,5,3,6") ||
-        arg_parser.get_str("perm") == std::string("0,1,3,4,2,5")))
-    {
-        if(arg_parser.get_str("perm") == std::string("0,1,3,4,2,5"))
-        {
-            // b_nr_kr_kw_nw_kv = 2,   // 0,1,3,4,2,5
-            matrix_core_swizzle_traits t;
-            t.data_type = data_type;
-            t.permute   = arg_parser.get_str("perm");
-
-            matrix_core_swizzle_args a;
-            a.p_src = x_buf.GetDeviceBuffer();
-            a.p_dst = y_buf.GetDeviceBuffer();
-            a.batch = shape[0];
-
-            auto nr = shape[1];
-            auto nw = shape[2];
-            auto kr = shape[3];
-            auto kw = shape[4];
-            auto kv = shape[5];
-            a.n     = nr * nw;
-            a.k     = kr * kw * kv;
-            if(kv == 8 && kw == 4 && nw == 16 && nr % 4 == 0 && kr % 8 == 0)
-            {
-                t.inst = "16x16x16";
-                std::cout << ", matrix_core_swizzle_waveflatten_" << t.inst << std::flush;
-
-                ave_time = matrix_core_swizzle(t, a, stream_config);
-            }
-            else if(kv == 8 && kw == 2 && nw == 32 && nr % 4 == 0 && kr % 8 == 0)
-            {
-                t.inst = "32x32x8";
-                std::cout << ", matrix_core_swizzle_waveflatten_" << t.inst << std::flush;
-
-                ave_time = matrix_core_swizzle(t, a, stream_config);
-            }
-            else
-            {
-                ave_time = run_permute();
-            }
-        }
-        else
-        {
-            matrix_core_swizzle_traits t;
-            t.data_type = data_type;
-            t.permute   = arg_parser.get_str("perm");
-
-            matrix_core_swizzle_args a;
-            a.p_src = x_buf.GetDeviceBuffer();
-            a.p_dst = y_buf.GetDeviceBuffer();
-            a.batch = shape[0];
-            a.n     = shape[1] * shape[2] * shape[3];
-            a.k     = shape[4] * shape[5] * shape[6];
-            if(shape[6] == 8 && shape[3] == 32 && shape[5] == 2 && shape[2] == 4 &&
-               shape[4] % 8 == 0 && shape[1] % 2 == 0)
-            {
-                // 32x32x8 inst
-                // perm=0,1,4,2,5,3,6
-                // y_shape=*,2x,8x,4,2,32,8 (3,6,16,4,2,32,8)
-                // shape = *,2x,4,32,8x,2,8 (3,6,4,32,16,2,8)
-
-                t.inst = "32x32x8";
-                std::cout << ", matrix_core_swizzle_" << t.inst << std::flush;
-
-                ave_time = matrix_core_swizzle(t, a, stream_config);
-            }
-            else if(shape[6] == 8 && shape[3] == 16 && shape[5] == 4 && shape[2] == 4 &&
-                    shape[4] % 4 == 0 && shape[1] % 4 == 0)
-            {
-                // 16x16x16 inst
-                // perm=0,1,4,2,5,3,6
-                // y_shape=*,4x,4x,4,4,16,8
-                // shape = *,4x,4,16,4x,4,8 (3,8,4,16,16,4,8)
-                t.inst = "16x16x16";
-                std::cout << ", matrix_core_swizzle_" << t.inst << std::flush;
-
-                ave_time = matrix_core_swizzle(t, a, stream_config);
-            }
-            else
-            {
-                ave_time = run_permute();
-            }
-        }
-    }
-    else
-#endif
-    {
-        ave_time = run_permute();
-    }
-    std::cout << ", time:" << ave_time << "ms" << std::flush;
-
-    bool pass = true;
-    if(do_validation)
-    {
-        reference_permute(x, y, perm);
-
-        ck_tile::HostTensor<DataType> y_dev(y.get_lengths());
-
-        y_buf.FromDevice(y_dev.data());
-
-        pass = std::equal(
-            y_dev.begin(), y_dev.end(), y.begin(), [&](const DataType& d, const DataType& h) {
-                using itype = to_integer_type<sizeof(DataType)>;
-                itype i_d   = ck_tile::bit_cast<itype>(d);
-                itype i_h   = ck_tile::bit_cast<itype>(h);
-                return i_d == i_h;
-            });
-        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush;
-    }
-
-    std::cout << std::endl;
-
-    return pass;
-}
-
-template <typename DataType>
-bool run_test_case(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-
-    if(!result)
-        return false;
-
-    return run<DataType>(arg_parser);
-}
-
-template <typename DataType>
-bool run_test_cases(std::vector<std::vector<std::string>>& test_cases)
-{
-    bool valid             = true;
-    constexpr int num_args = 6;
-    char* argv[num_args];
-
-    for(std::size_t test_idx = 0; test_idx < test_cases.size(); ++test_idx)
-    {
-        assert(test_cases[test_idx].size() == num_args &&
-               "invalid number of arguments in test case");
-
-        for(int arg_idx = 0; arg_idx < num_args; ++arg_idx)
-        {
-            argv[arg_idx] = test_cases[test_idx][arg_idx].data();
-        }
-
-        valid = valid && run_test_case<DataType>(num_args, argv);
-
-        if(!valid)
-            break;
-    }
-
-    return valid;
-}
-
-std::vector<std::vector<std::string>> create_test_cases(const std::string prec)
-{
-    return {
-        {"-prec=" + prec, "-shape=3,8", "-perm=1,0", "-v=1", "-warmup=0", "-repeat=1"},
-        {"-prec=" + prec, "-shape=48,6,8", "-perm=2,1,0", "-v=1", "-warmup=0", "-repeat=1"},
-        {"-prec=" + prec, "-shape=24,128,3", "-perm=0,2,1", "-v=1", "-warmup=0", "-repeat=1"},
-        {"-prec=" + prec, "-shape=4,10,7,6", "-perm=0,2,3,1", "-v=1", "-warmup=0", "-repeat=1"},
-        {"-prec=" + prec, "-shape=8,24,36,10", "-perm=3,1,2,0", "-v=1", "-warmup=0", "-repeat=1"},
-        {"-prec=" + prec, "-shape=8,1,36,4", "-perm=2,1,0,3", "-v=1", "-warmup=0", "-repeat=1"},
-        {"-prec=" + prec,
-         "-shape=5,10,16,2,36,4",
-         "-perm=4,5,2,1,0,3",
-         "-v=1",
-         "-warmup=0",
-         "-repeat=1"},
-        {"-prec=" + prec,
-         "-shape=2,32,8,3,6,2,5,4",
-         "-perm=5,2,4,7,1,6,3,0",
-         "-v=1",
-         "-warmup=0",
-         "-repeat=1"}};
-}
-
-std::vector<std::vector<std::string>> create_test_cases_fp16()
-{
-    return {{"-prec=fp16",
-             "-shape=3,6,4,32,16,2,8",
-             "-perm=0,1,4,2,5,3,6",
-             "-v=1",
-             "-warmup=0",
-             "-repeat=1"},
-            {"-prec=fp16",
-             "-shape=5,10,4,32,8,2,8",
-             "-perm=0,1,4,2,5,3,6",
-             "-v=1",
-             "-warmup=0",
-             "-repeat=1"},
-            {"-prec=fp16",
-             "-shape=3,8,4,16,16,4,8",
-             "-perm=0,1,4,2,5,3,6",
-             "-v=1",
-             "-warmup=0",
-             "-repeat=1"},
-            {"-prec=fp16",
-             "-shape=3,6,4,32,16,2,8",
-             "-perm=0,1,2,4,5,3,6",
-             "-v=1",
-             "-warmup=0",
-             "-repeat=1"},
-            {"-prec=fp16",
-             "-shape=5,10,4,32,8,2,8",
-             "-perm=0,1,2,4,5,3,6",
-             "-v=1",
-             "-warmup=0",
-             "-repeat=1"},
-            {"-prec=fp16",
-             "-shape=3,8,4,16,16,4,8",
-             "-perm=0,1,2,4,5,3,6",
-             "-v=1",
-             "-warmup=0",
-             "-repeat=1"},
-            {"-prec=fp16",
-             "-shape=2,8,16,8,4,8",
-             "-perm=0,1,3,4,2,5",
-             "-v=1",
-             "-warmup=0",
-             "-repeat=1"},
-            {"-prec=fp16",
-             "-shape=1,24,32,16,2,8",
-             "-perm=0,1,3,4,2,5",
-             "-v=1",
-             "-warmup=0",
-             "-repeat=1"},
-            {"-prec=fp16", "-shape=3,8", "-perm=1,0", "-v=1", "-warmup=0", "-repeat=1"},
-            {"-prec=fp16", "-shape=48,6,8", "-perm=2,1,0", "-v=1", "-warmup=0", "-repeat=1"},
-            {"-prec=fp16", "-shape=24,128,3", "-perm=0,2,1", "-v=1", "-warmup=0", "-repeat=1"},
-            {"-prec=fp16", "-shape=4,10,7,6", "-perm=0,2,3,1", "-v=1", "-warmup=0", "-repeat=1"},
-            {"-prec=fp16", "-shape=8,24,36,10", "-perm=3,1,2,0", "-v=1", "-warmup=0", "-repeat=1"},
-            {"-prec=fp16", "-shape=8,1,36,4", "-perm=2,1,0,3", "-v=1", "-warmup=0", "-repeat=1"},
-            {"-prec=fp16",
-             "-shape=5,10,16,2,36,4",
-             "-perm=4,5,2,1,0,3",
-             "-v=1",
-             "-warmup=0",
-             "-repeat=1"},
-            {"-prec=fp16",
-             "-shape=2,32,8,3,6,2,5,4",
-             "-perm=5,2,4,7,1,6,3,0",
-             "-v=1",
-             "-warmup=0",
-             "-repeat=1"}};
-}
diff --git a/test/ck_tile/permute/test_permute.cpp b/test/ck_tile/permute/test_permute.cpp
new file mode 100644
index 0000000000..3a2bcecf58
--- /dev/null
+++ b/test/ck_tile/permute/test_permute.cpp
@@ -0,0 +1,14 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include "test_permute_types.hpp"
+#include "test_permute_util.hpp"
+#include "gtest/gtest.h"
+
+#define TEST_SUITE_NAME TestCkTilePermute
+
+TYPED_TEST_SUITE(TestCkTilePermute, KernelTypesPermute);
+
+#include "test_permute_cases.inc"
+
+#undef TEST_SUITE_NAME
diff --git a/test/ck_tile/permute/test_permute_cases.inc b/test/ck_tile/permute/test_permute_cases.inc
new file mode 100755
index 0000000000..e596bfc721
--- /dev/null
+++ b/test/ck_tile/permute/test_permute_cases.inc
@@ -0,0 +1,279 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#pragma once
+
+#ifndef TEST_PERMUTE_CASES_INC
+#define TEST_PERMUTE_CASES_INC
+TYPED_TEST(TEST_SUITE_NAME, Permute1)
+{
+    // Rank-2 transpose: (3, 8) -> (8, 3).
+    auto shape = std::vector<ck_tile::index_t>{3, 8};
+    auto perm  = std::string{"1,0"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute2)
+{
+    // Rank-3 axis reversal: (48, 6, 8) -> (8, 6, 48).
+    auto shape = std::vector<ck_tile::index_t>{48, 6, 8};
+    auto perm  = std::string{"2,1,0"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute3)
+{
+    // Rank-3 swap of the last two axes: (24, 128, 3) -> (24, 3, 128).
+    auto shape = std::vector<ck_tile::index_t>{24, 128, 3};
+    auto perm  = std::string{"0,2,1"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute4)
+{
+    // Rank-4, leading axis kept, trailing axes rotated: (4, 10, 7, 6) -> (4, 7, 6, 10).
+    auto shape = std::vector<ck_tile::index_t>{4, 10, 7, 6};
+    auto perm  = std::string{"0,2,3,1"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute5)
+{
+    // Rank-4 swap of the first and last axes: (8, 24, 36, 10) -> (10, 24, 36, 8).
+    auto shape = std::vector<ck_tile::index_t>{8, 24, 36, 10};
+    auto perm  = std::string{"3,1,2,0"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute6)
+{
+    // Rank-4 swap of axes 0 and 2 with a length-1 axis: (8, 1, 36, 4) -> (36, 1, 8, 4).
+    auto shape = std::vector<ck_tile::index_t>{8, 1, 36, 4};
+    auto perm  = std::string{"2,1,0,3"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute7)
+{
+    // Rank-6 general permutation: (5, 10, 16, 2, 36, 4) -> (36, 4, 16, 10, 5, 2).
+    auto shape = std::vector<ck_tile::index_t>{5, 10, 16, 2, 36, 4};
+    auto perm  = std::string{"4,5,2,1,0,3"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute8)
+{
+    // Rank-8 general permutation; the output shape is (2, 8, 6, 4, 32, 5, 3, 2).
+    auto shape = std::vector<ck_tile::index_t>{2, 32, 8, 3, 6, 2, 5, 4};
+    auto perm  = std::string{"5,2,4,7,1,6,3,0"};
+    this->Run(shape, perm);
+}
+TYPED_TEST(TEST_SUITE_NAME, Permute9)
+{
+    // fp16 only. The shape satisfies the 32x32x8 swizzle conditions checked in Run().
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{3, 6, 4, 32, 16, 2, 8};
+    auto perm  = std::string{"0,1,4,2,5,3,6"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute10)
+{
+    // fp16 only. The shape satisfies the 32x32x8 swizzle conditions checked in Run().
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{5, 10, 4, 32, 8, 2, 8};
+    auto perm  = std::string{"0,1,4,2,5,3,6"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute11)
+{
+    // fp16 only. The shape satisfies the 16x16x16 swizzle conditions checked in Run().
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{3, 8, 4, 16, 16, 4, 8};
+    auto perm  = std::string{"0,1,4,2,5,3,6"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute12)
+{
+    // fp16 only. Same shape as Permute9 (32x32x8 pattern) with perm 0,1,2,4,5,3,6.
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{3, 6, 4, 32, 16, 2, 8};
+    auto perm  = std::string{"0,1,2,4,5,3,6"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute13)
+{
+    // fp16 only. Same shape as Permute10 (32x32x8 pattern) with perm 0,1,2,4,5,3,6.
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{5, 10, 4, 32, 8, 2, 8};
+    auto perm  = std::string{"0,1,2,4,5,3,6"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute14)
+{
+    // fp16 only. Same shape as Permute11 (16x16x16 pattern) with perm 0,1,2,4,5,3,6.
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{3, 8, 4, 16, 16, 4, 8};
+    auto perm  = std::string{"0,1,2,4,5,3,6"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute15)
+{
+    // fp16 only. Rank-6 shape satisfying Run()'s 16x16x16 wave-flatten swizzle conditions.
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{2, 8, 16, 8, 4, 8};
+    auto perm  = std::string{"0,1,3,4,2,5"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute16)
+{
+    // fp16 only. Rank-6 shape satisfying Run()'s 32x32x8 wave-flatten swizzle conditions.
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{1, 24, 32, 16, 2, 8};
+    auto perm  = std::string{"0,1,3,4,2,5"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute17)
+{
+    // fp16 only. Same shape and perm as Permute1.
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{3, 8};
+    auto perm  = std::string{"1,0"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute18)
+{
+    // fp16 only. Same shape and perm as Permute2.
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{48, 6, 8};
+    auto perm  = std::string{"2,1,0"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute19)
+{
+    // fp16 only. Same shape and perm as Permute3.
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{24, 128, 3};
+    auto perm  = std::string{"0,2,1"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute20)
+{
+    // fp16 only. Same shape and perm as Permute4.
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{4, 10, 7, 6};
+    auto perm  = std::string{"0,2,3,1"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute21)
+{
+    // fp16 only. Same shape and perm as Permute5.
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{8, 24, 36, 10};
+    auto perm  = std::string{"3,1,2,0"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute22)
+{
+    // fp16 only. Same shape and perm as Permute6.
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{8, 1, 36, 4};
+    auto perm  = std::string{"2,1,0,3"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute23)
+{
+    // fp16 only. Same shape and perm as Permute7.
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{5, 10, 16, 2, 36, 4};
+    auto perm  = std::string{"4,5,2,1,0,3"};
+    this->Run(shape, perm);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Permute24)
+{
+    // fp16 only. Same shape and perm as Permute8.
+    constexpr bool is_fp16 = std::is_same_v<TypeParam, F16Types>;
+    if constexpr(!is_fp16)
+    {
+        GTEST_SKIP() << "Skipping this test: Only run with fp16";
+    }
+    auto shape = std::vector<ck_tile::index_t>{2, 32, 8, 3, 6, 2, 5, 4};
+    auto perm  = std::string{"5,2,4,7,1,6,3,0"};
+    this->Run(shape, perm);
+}
+
+#endif
diff --git a/test/ck_tile/permute/test_permute_types.hpp b/test/ck_tile/permute/test_permute_types.hpp
new file mode 100644
index 0000000000..412e1e14ba
--- /dev/null
+++ b/test/ck_tile/permute/test_permute_types.hpp
@@ -0,0 +1,10 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include <tuple>
+#include "ck_tile/host.hpp"
+#include "gtest/gtest.h"
+
+using F16Types = std::tuple<ck_tile::fp16_t>; // named so test cases can detect the fp16 run
+using KernelTypesPermute =
+    ::testing::Types<F16Types, std::tuple<float>, std::tuple<ck_tile::fp8_t>>;
diff --git a/test/ck_tile/permute/test_permute_util.hpp b/test/ck_tile/permute/test_permute_util.hpp
new file mode 100644
index 0000000000..5494749541
--- /dev/null
+++ b/test/ck_tile/permute/test_permute_util.hpp
@@ -0,0 +1,328 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#pragma once
+#include "permute.hpp"
+#include "ck_tile/host.hpp"
+
+#include <array>
+#include <cassert>
+#include <cstring>
+#include <functional>
+#include <numeric>
+#include <ostream>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#ifdef PERMUTE_USE_ALTERNATIVE_IMPL
+#include "alternative_impl/matrix_core_swizzle.hpp"
+#endif
+
+namespace detail {
+// Maps a size in bytes (1, 2 or 4) to the signed integer type of that size.
+template <int num_bytes>
+struct to_integer_type;
+template <>
+struct to_integer_type<1>
+{
+    using type = int8_t;
+};
+template <>
+struct to_integer_type<2>
+{
+    using type = int16_t;
+};
+template <>
+struct to_integer_type<4>
+{
+    using type = int32_t;
+};
+} // namespace detail
+
+template <int num_bytes>
+using to_integer_type = typename detail::to_integer_type<num_bytes>::type;
+
+// Host-side launcher for the generic permute kernel (should come from codegen).
+// Returns the value reported by ck_tile::launch_kernel for this launch.
+template <typename DataType>
+float permute(permute_args a, const ck_tile::stream_config& s)
+{
+    using Problem = ck_tile::GenericPermuteProblem<DataType>;
+    using Kernel  = ck_tile::GenericPermute<Problem>;
+
+    auto kernel_args = Kernel::MakeKargs(a);
+
+    const dim3 grid_dim  = Kernel::GridSize(a);
+    const dim3 block_dim = Kernel::BlockSize();
+
+    // NOTE(review): the <1> template argument is presumably kBlockPerCu -- confirm.
+    return ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<1>(Kernel{}, grid_dim, block_dim, 0, kernel_args));
+}
+
+// Streams a vector as "[a, b, c]"; an empty vector prints as "[]".
+// Used by the fixture to log the shapes and the permutation.
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
+{
+    os << "[";
+    const char* separator = "";
+    for(const auto& element : v)
+    {
+        os << separator << element;
+        separator = ", ";
+    }
+    os << "]";
+
+    return os;
+}
+
+// different threshold for different dtype
+template <typename DataType>
+auto get_elimit(std::string /*init_method*/)
+{
+    double rtol = 1e-3;
+    double atol = 1e-3;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+inline auto get_elimit<ck_tile::bf16_t>(std::string /*init_method*/)
+{
+    // `inline`: a full specialization defined in a header is otherwise not ODR-safe.
+    double tol = 1e-2; // bf16 uses 1e-2 for both rtol and atol
+    return ck_tile::make_tuple(tol, tol);
+}
+
+template <>
+inline auto get_elimit<ck_tile::fp8_t>(std::string init_method)
+{
+    // `inline`: a full specialization defined in a header is otherwise not ODR-safe.
+    // Returns (max_rounding_point_distance, atol); "ui"/"ni" get the tighter limits.
+    // NOTE(review): "ui"/"ni" are presumably integer-valued init methods -- confirm.
+    unsigned max_rounding_point_distance = 1;
+    double atol                          = 0.0625;
+    if(init_method == "ui" || init_method == "ni")
+    {
+        max_rounding_point_distance = 0;
+        atol                        = 2e-3;
+    }
+
+    return ck_tile::make_tuple(max_rounding_point_distance, atol);
+}
+
+// Parse a comma-separated integer list, e.g. "1,2,3,4" -> vector{1,2,3,4}.
+// Fields are converted with std::atoi, so a non-numeric field decodes to 0 and "" yields {0}.
+inline std::vector<ck_tile::index_t> decode_vec(std::string q_val)
+{
+    std::vector<ck_tile::index_t> v;
+    std::string::size_type pos = 0;
+    while(true)
+    {
+        // `found` is npos on the last field; substr(pos, npos) then takes the remainder.
+        const auto found        = q_val.find(',', pos);
+        const auto count        = found == std::string::npos ? found : found - pos;
+        const std::string field = q_val.substr(pos, count);
+        v.push_back(static_cast<ck_tile::index_t>(std::atoi(field.c_str())));
+        if(found == std::string::npos)
+        {
+            break;
+        }
+        pos = found + 1;
+    }
+    return v;
+}
+
+// Typed gtest fixture for the ck_tile permute kernel. DataType is the first element of Tuple.
+template <typename Tuple>
+class TestCkTilePermute : public ::testing::Test
+{
+    protected:
+    using DataType = std::tuple_element_t<0, Tuple>;
+
+    // Permutes a seeded, integer-valued tensor of `shape` on the device according to `perm`
+    // (comma-separated axis list) and expects a bit-exact match with host reference_permute.
+    void Run(std::vector<ck_tile::index_t>& shape, std::string& perm)
+    {
+        std::string data_type                  = get_precision_string();
+        std::vector<ck_tile::index_t> perm_vec = decode_vec(perm);
+        int seed                               = 11939;
+
+        // gtest assertions instead of assert(): these checks must also hold in NDEBUG builds.
+        ASSERT_EQ(shape.size(), perm_vec.size());
+        ck_tile::index_t rank = perm_vec.size();
+        if(rank > ck_tile::GenericPermuteHostArgs::kMaxRanks)
+        {
+            FAIL() << "rank " << rank << " permute is not support yet";
+        }
+
+        ck_tile::HostTensor<DataType> x(shape);
+        ck_tile::FillUniformDistributionIntegerValue<DataType>{-15, 15, seed}(x);
+        // Output extents follow the permutation: y_shape[i] = shape[perm_vec[i]].
+        std::vector<ck_tile::index_t> y_shape = [&]() {
+            std::vector<ck_tile::index_t> tmp(rank, 0);
+
+            for(int i = 0; i < static_cast<int>(rank); i++)
+            {
+                tmp[i] = shape[perm_vec[i]];
+            }
+
+            return tmp;
+        }();
+
+        ck_tile::HostTensor<DataType> y(y_shape);
+
+        ck_tile::DeviceMem x_buf(x.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem y_buf(y.get_element_space_size_in_bytes());
+
+        x_buf.ToDevice(x.data());
+
+        std::cout << "[" << data_type << "] shape:" << shape << "->" << y_shape
+                  << ", permute:" << perm_vec << std::endl;
+
+        ck_tile::stream_config stream_config{nullptr, false, 0, 0, 1};
+        // Generic kernel launch; also the fallback when no swizzle pattern below matches.
+        auto run_permute = [&]() {
+            permute_args a;
+            a.p_src = x_buf.GetDeviceBuffer();
+            a.p_dst = y_buf.GetDeviceBuffer();
+            a.rank  = rank;
+            std::copy(shape.begin(), shape.end(), a.shape);
+            std::copy(perm_vec.begin(), perm_vec.end(), a.perm);
+
+            return permute<DataType>(a, stream_config);
+        };
+#ifdef PERMUTE_USE_ALTERNATIVE_IMPL
+        // batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2
+        if((perm == std::string("0,1,4,2,5,3,6") || perm == std::string("0,1,2,4,5,3,6") ||
+            perm == std::string("0,1,3,4,2,5")))
+        {
+            if(perm == std::string("0,1,3,4,2,5"))
+            {
+                // b_nr_kr_kw_nw_kv = 2,   // 0,1,3,4,2,5
+                matrix_core_swizzle_traits t;
+                t.permute = perm;
+
+                matrix_core_swizzle_args a;
+                a.p_src = x_buf.GetDeviceBuffer();
+                a.p_dst = y_buf.GetDeviceBuffer();
+                a.batch = shape[0];
+
+                auto nr = shape[1];
+                auto nw = shape[2];
+                auto kr = shape[3];
+                auto kw = shape[4];
+                auto kv = shape[5];
+                a.n     = nr * nw;
+                a.k     = kr * kw * kv;
+                if(kv == 8 && kw == 4 && nw == 16 && nr % 4 == 0 && kr % 8 == 0)
+                {
+                    t.inst = "16x16x16";
+                    std::cout << ", matrix_core_swizzle_waveflatten_" << t.inst << std::flush;
+
+                    matrix_core_swizzle<DataType>(t, a, stream_config);
+                }
+                else if(kv == 8 && kw == 2 && nw == 32 && nr % 4 == 0 && kr % 8 == 0)
+                {
+                    t.inst = "32x32x8";
+                    std::cout << ", matrix_core_swizzle_waveflatten_" << t.inst << std::flush;
+
+                    matrix_core_swizzle<DataType>(t, a, stream_config);
+                }
+                else
+                {
+                    run_permute();
+                }
+            }
+            else
+            {
+                matrix_core_swizzle_traits t;
+                t.permute = perm;
+
+                matrix_core_swizzle_args a;
+                a.p_src = x_buf.GetDeviceBuffer();
+                a.p_dst = y_buf.GetDeviceBuffer();
+                a.batch = shape[0];
+                a.n     = shape[1] * shape[2] * shape[3];
+                a.k     = shape[4] * shape[5] * shape[6];
+                if(shape[6] == 8 && shape[3] == 32 && shape[5] == 2 && shape[2] == 4 &&
+                   shape[4] % 8 == 0 && shape[1] % 2 == 0)
+                {
+                    // 32x32x8 inst
+                    // perm=0,1,4,2,5,3,6
+                    // y_shape=*,2x,8x,4,2,32,8 (3,6,16,4,2,32,8)
+                    // shape = *,2x,4,32,8x,2,8 (3,6,4,32,16,2,8)
+
+                    t.inst = "32x32x8";
+                    std::cout << ", matrix_core_swizzle_" << t.inst << std::flush;
+
+                    matrix_core_swizzle<DataType>(t, a, stream_config);
+                }
+                else if(shape[6] == 8 && shape[3] == 16 && shape[5] == 4 && shape[2] == 4 &&
+                        shape[4] % 4 == 0 && shape[1] % 4 == 0)
+                {
+                    // 16x16x16 inst
+                    // perm=0,1,4,2,5,3,6
+                    // y_shape=*,4x,4x,4,4,16,8
+                    // shape = *,4x,4,16,4x,4,8 (3,8,4,16,16,4,8)
+                    t.inst = "16x16x16";
+                    std::cout << ", matrix_core_swizzle_" << t.inst << std::flush;
+
+                    matrix_core_swizzle<DataType>(t, a, stream_config);
+                }
+                else
+                {
+                    run_permute();
+                }
+            }
+        }
+        else
+#endif
+        {
+            run_permute();
+        }
+
+        // Validation: device and host results are compared as raw bit patterns, not values.
+        reference_permute(x, y, perm_vec);
+
+        ck_tile::HostTensor<DataType> y_dev(y.get_lengths());
+
+        y_buf.FromDevice(y_dev.data());
+
+        const bool pass = std::equal(
+            y_dev.begin(), y_dev.end(), y.begin(), [&](const DataType& d, const DataType& h) {
+                using itype = to_integer_type<sizeof(DataType)>;
+                itype i_d   = ck_tile::bit_cast<itype>(d);
+                itype i_h   = ck_tile::bit_cast<itype>(h);
+                return i_d == i_h;
+            });
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush;
+        std::cout << std::endl;
+
+        EXPECT_TRUE(pass);
+    }
+
+    // Returns the log tag for DataType ("fp16", "fp8" or "fp32"); throws for any other type.
+    static std::string get_precision_string()
+    {
+        if constexpr(std::is_same_v<DataType, ck_tile::fp16_t>)
+        {
+            return "fp16";
+        }
+        else if constexpr(std::is_same_v<DataType, ck_tile::fp8_t>)
+        {
+            return "fp8";
+        }
+        else if constexpr(std::is_same_v<DataType, float>)
+        {
+            return "fp32";
+        }
+        else
+        {
+            throw std::runtime_error("invalid precision");
+        }
+    }
+};
diff --git a/test/ck_tile/reduce/test_reduce2d.cpp b/test/ck_tile/reduce/test_reduce2d.cpp
index 821d0a6c3e..ff807e52c9 100644
--- a/test/ck_tile/reduce/test_reduce2d.cpp
+++ b/test/ck_tile/reduce/test_reduce2d.cpp
@@ -82,18 +82,18 @@ class TestCkTileReduce : public ::testing::Test
             throw std::runtime_error("Wrong! Arguments not supported!\n");
         }
 
-        ck_tile::launch_kernel(ck_tile::stream_config{nullptr, false, 0},
-                               ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
-                                   Kernel{},
-                                   kGridSize,
-                                   kBlockSize,
-                                   0,
-                                   static_cast<XDataType*>(d_x_mem.GetDeviceBuffer()),
-                                   static_cast<YDataType*>(d_y_mem.GetDeviceBuffer()),
-                                   input_shape_tuple,
-                                   input_strides_tuple,
-                                   kept_dims,
-                                   reduce_dims));
+        ck_tile::launch_kernel(
+            ck_tile::stream_config{nullptr, false, 0},
+            ck_tile::make_kernel<kBlockPerCu>(Kernel{},
+                                              kGridSize,
+                                              kBlockSize,
+                                              0,
+                                              static_cast<XDataType*>(d_x_mem.GetDeviceBuffer()),
+                                              static_cast<YDataType*>(d_y_mem.GetDeviceBuffer()),
+                                              input_shape_tuple,
+                                              input_strides_tuple,
+                                              kept_dims,
+                                              reduce_dims));
 
         // Get results back
         d_y_mem.FromDevice(h_y.data());
diff --git a/test/ck_tile/rmsnorm2d/generate.py b/test/ck_tile/rmsnorm2d/generate.py
index 4296b7373e..1a1c842b3c 100644
--- a/test/ck_tile/rmsnorm2d/generate.py
+++ b/test/ck_tile/rmsnorm2d/generate.py
@@ -246,7 +246,7 @@ float rmsnorm2d_fwd_(const S& s, A a)
         std::cout << ", " << Kernel::GetName() << std::flush;
 
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{{}}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(Kernel{{}}, grids, blocks, 0, kargs));
 }}
 
 """
diff --git a/test/ck_tile/smoothquant/CMakeLists.txt b/test/ck_tile/smoothquant/CMakeLists.txt
index de4459051c..548fc03a41 100644
--- a/test/ck_tile/smoothquant/CMakeLists.txt
+++ b/test/ck_tile/smoothquant/CMakeLists.txt
@@ -3,7 +3,7 @@ if(GPU_TARGETS MATCHES "gfx9")
     function (add_smoothquant_test TARGET_NAME MAIN_SRC)
         message(DEBUG "adding ${TARGET_NAME}")
 
-        add_test_executable(${TARGET_NAME} ${MAIN_SRC})
+        add_gtest_executable(${TARGET_NAME} ${MAIN_SRC})
         target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 
         foreach(source IN LISTS ARGN)
@@ -20,8 +20,7 @@ if(GPU_TARGETS MATCHES "gfx9")
     endfunction(add_smoothquant_test TARGET_NAME MAIN_SRC)
 
     file(GLOB INSTANCE_SRCS instances/*.cpp)
-    add_smoothquant_test(test_ck_tile_smoothquant_fp16 smoothquant_fp16.cpp ${INSTANCE_SRCS})
-    add_smoothquant_test(test_ck_tile_smoothquant_bf16 smoothquant_bf16.cpp ${INSTANCE_SRCS})
+    add_smoothquant_test(test_ck_tile_smoothquant test_smoothquant.cpp ${INSTANCE_SRCS})
 
 else()
     message(DEBUG "Skipping ck_tile smoothquant tests for current target")
diff --git a/test/ck_tile/smoothquant/instances/smoothquant_fwd_api.cpp b/test/ck_tile/smoothquant/instances/smoothquant_fwd_api.cpp
index 4b7ef5a38d..04e6732a7e 100644
--- a/test/ck_tile/smoothquant/instances/smoothquant_fwd_api.cpp
+++ b/test/ck_tile/smoothquant/instances/smoothquant_fwd_api.cpp
@@ -22,9 +22,7 @@ using trait_ = smoothquant_traits_<DataType_,
                                    kTwoPass_>;
 
 template <typename data_type>
-float smoothquant_dispatch(smoothquant_traits /*t*/,
-                           smoothquant_args a,
-                           const ck_tile::stream_config& s)
+float smoothquant_dispatch(smoothquant_args a, const ck_tile::stream_config& s)
 {
     float r = -1;
     // clang-format off
@@ -128,16 +126,14 @@ float smoothquant_dispatch(smoothquant_traits /*t*/,
     // clang-format on
 }
 
-float smoothquant(smoothquant_traits t, smoothquant_args a, const ck_tile::stream_config& s)
+template <>
+float smoothquant<ck_tile::fp16_t>(smoothquant_args a, const ck_tile::stream_config& s)
 {
-    if(t.data_type.compare("fp16") == 0)
-    {
-        return smoothquant_dispatch<ck_tile::fp16_t>(t, a, s);
-    }
-    else if(t.data_type.compare("bf16") == 0)
-    {
-        return smoothquant_dispatch<ck_tile::bf16_t>(t, a, s);
-    }
-    else
-        throw std::runtime_error("Without supported instances!");
+    return smoothquant_dispatch<ck_tile::fp16_t>(a, s);
+}
+
+template <>
+float smoothquant<ck_tile::bf16_t>(smoothquant_args a, const ck_tile::stream_config& s)
+{
+    return smoothquant_dispatch<ck_tile::bf16_t>(a, s);
 }
diff --git a/test/ck_tile/smoothquant/instances/smoothquant_instance_common.hpp b/test/ck_tile/smoothquant/instances/smoothquant_instance_common.hpp
index 19310beb94..8929289cdb 100644
--- a/test/ck_tile/smoothquant/instances/smoothquant_instance_common.hpp
+++ b/test/ck_tile/smoothquant/instances/smoothquant_instance_common.hpp
@@ -57,5 +57,5 @@ float smoothquant_(const S& s, A a)
         std::cout << ", " << Kernel::GetName() << std::flush;
 
     return ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 }
diff --git a/test/ck_tile/smoothquant/smoothquant.hpp b/test/ck_tile/smoothquant/smoothquant.hpp
index ce9ab25448..b1d5dae3d3 100644
--- a/test/ck_tile/smoothquant/smoothquant.hpp
+++ b/test/ck_tile/smoothquant/smoothquant.hpp
@@ -111,4 +111,5 @@ struct smoothquant_traits
     std::string data_type;
 };
 
-float smoothquant(smoothquant_traits, smoothquant_args, const ck_tile::stream_config&);
+template <typename DataType>
+float smoothquant(smoothquant_args, const ck_tile::stream_config&);
diff --git a/test/ck_tile/smoothquant/smoothquant.inc b/test/ck_tile/smoothquant/smoothquant.inc
deleted file mode 100644
index 23dba27e88..0000000000
--- a/test/ck_tile/smoothquant/smoothquant.inc
+++ /dev/null
@@ -1,273 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#include "ck_tile/host.hpp"
-#include "smoothquant.hpp"
-#include <cstring>
-
-// different threshold for different dtype
-template <typename DataType>
-auto get_elimit()
-{
-    double rtol = 1e-5;
-    double atol = 1e-5;
-    return ck_tile::make_tuple(rtol, atol);
-}
-
-template <>
-auto get_elimit<ck_tile::bf16_t>()
-{
-    double rtol = 1e-5;
-    double atol = 1e-5;
-    return ck_tile::make_tuple(rtol, atol);
-}
-
-template <>
-auto get_elimit<ck_tile::int8_t>()
-{
-    // due to rounding, int8 quantization might have 1 abs error
-    double rtol = 1;
-    double atol = 1;
-    return ck_tile::make_tuple(rtol, atol);
-}
-
-auto create_args(int argc, char* argv[], int index = 0)
-{
-    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("m", "3328", "m dimension")
-        .insert("n", "4096", "n dimension")
-        .insert("x_stride", "-1", "input stride per row, if -1 then equal to n")
-        .insert("y_stride", "-1", "output stride per row, if -1 then equal to n")
-        .insert("v", "1", "cpu validation or not")
-        .insert("kname", "1", "print kernel name or not")
-        .insert("prec", "fp16", "precision")
-        .insert("warmup", "5", "cold iter")
-        .insert("repeat", "20", "hot iter");
-
-    bool result = arg_parser.parse(argc, argv, index);
-    return std::make_tuple(result, arg_parser);
-}
-
-template <typename DataType>
-bool run(const ck_tile::ArgParser& arg_parser)
-{
-    ck_tile::index_t m        = arg_parser.get_int("m");
-    ck_tile::index_t n        = arg_parser.get_int("n");
-    ck_tile::index_t x_stride = arg_parser.get_int("x_stride");
-    if(x_stride < 0)
-        x_stride = n;
-    ck_tile::index_t y_stride = arg_parser.get_int("y_stride");
-    if(y_stride < 0)
-        y_stride = n;
-    std::string data_type = arg_parser.get_str("prec");
-    int kname             = arg_parser.get_int("kname");
-    int do_validation     = arg_parser.get_int("v");
-    int warmup            = arg_parser.get_int("warmup");
-    int repeat            = arg_parser.get_int("repeat");
-
-    assert(x_stride >= n);
-
-    using TypeConfig = SmoothquantTypeConfig<DataType>;
-
-    using XDataType           = typename TypeConfig::XDataType;
-    using SmoothScaleDataType = typename TypeConfig::SmoothScaleDataType;
-    using YScaleDataType      = typename TypeConfig::YScaleDataType;
-    using QYDataType          = typename TypeConfig::QYDataType;
-    using ComputeDataType     = typename TypeConfig::ComputeDataType;
-
-    // host verify
-    ck_tile::HostTensor<XDataType> x_host({m, n}, {x_stride, 1});
-    ck_tile::HostTensor<SmoothScaleDataType> smscale_host({n});
-
-    ck_tile::HostTensor<YScaleDataType> yscale_host_ref({m}, {1});
-    ck_tile::HostTensor<YScaleDataType> yscale_host_dev({m}, {1});
-
-    ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {y_stride, 1});
-    ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {y_stride, 1});
-
-    ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
-    ck_tile::FillUniformDistribution<SmoothScaleDataType>{1e-3, .5f}(smscale_host);
-
-    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem smscale_buf(smscale_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes());
-
-    x_buf.ToDevice(x_host.data());
-    smscale_buf.ToDevice(smscale_host.data());
-
-    std::cout << "[" << data_type << "]" << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
-              << ", y_stride:" << y_stride << std::flush;
-
-    smoothquant_traits traits{data_type};
-
-    smoothquant_args args{x_buf.GetDeviceBuffer(),
-                          smscale_buf.GetDeviceBuffer(),
-                          yscale_buf.GetDeviceBuffer(),
-                          qy_buf.GetDeviceBuffer(),
-                          m,
-                          n,
-                          x_stride,
-                          y_stride};
-
-    float ave_time = smoothquant(
-        traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
-
-    std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(SmoothScaleDataType) * n +
-                           sizeof(YScaleDataType) * m + sizeof(QYDataType) * m * n;
-
-    float gb_per_sec = num_byte / 1.E6 / ave_time;
-    std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush;
-
-    bool pass = true;
-
-    if(do_validation)
-    {
-        using YDataType = ComputeDataType;
-        ck_tile::HostTensor<ComputeDataType> y_host({m, n}, {y_stride, 1});
-        // smooth outlier
-        {
-            auto f = [&](auto n_) {
-                auto v_smscale = ck_tile::type_convert<ComputeDataType>(smscale_host(n_));
-
-                for(int m_ = 0; m_ < m; ++m_)
-                {
-                    auto v_x       = ck_tile::type_convert<ComputeDataType>(x_host(m_, n_));
-                    y_host(m_, n_) = v_x * v_smscale;
-                }
-            };
-
-            ck_tile::make_ParallelTensorFunctor(f, smscale_host.get_element_space_size())(
-                std::thread::hardware_concurrency());
-        }
-
-        // yscale
-        {
-            ck_tile::HostTensor<YDataType> y_rowwise_amax_host({m});
-
-            using ReduceAmax = ck_tile::ReduceOp::AbsMax;
-            ck_tile::reference_reduce<ComputeDataType, ComputeDataType, YDataType>(
-                y_host, y_rowwise_amax_host, ReduceAmax{});
-
-            auto op = [](const auto& v0) {
-                return v0 /
-                       ck_tile::type_convert<ComputeDataType>(ck_tile::numeric<QYDataType>::max());
-            };
-            ck_tile::reference_unary_elementwise<YDataType, YScaleDataType, ComputeDataType>(
-                y_rowwise_amax_host, yscale_host_ref, op);
-
-            yscale_buf.FromDevice(yscale_host_dev.mData.data());
-
-            auto [rtol, atol] = get_elimit<YScaleDataType>();
-            pass &= ck_tile::check_err(yscale_host_dev,
-                                       yscale_host_ref,
-                                       std::string("yscale Error: Incorrect results!"),
-                                       rtol,
-                                       atol);
-        }
-
-        // rowwise quantization
-        {
-            ck_tile::reference_rowwise_quantization2d<YDataType, YScaleDataType, QYDataType>(
-                y_host, yscale_host_ref, qy_host_ref);
-
-            qy_buf.FromDevice(qy_host_dev.data());
-            auto [rtol, atol] = get_elimit<QYDataType>();
-
-            if(y_stride == n)
-            {
-                pass = ck_tile::check_err(qy_host_dev,
-                                          qy_host_ref,
-                                          std::string("qy Error: Incorrect results!"),
-                                          rtol,
-                                          atol);
-            }
-            else
-            {
-                for(int i_r = 0; i_r < m; i_r++)
-                {
-                    std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * y_stride,
-                                                            qy_host_dev.begin() + i_r * y_stride +
-                                                                n);
-                    std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * y_stride,
-                                                            qy_host_ref.begin() + i_r * y_stride +
-                                                                n);
-                    pass &= ck_tile::check_err(qy_host_dev_row,
-                                               qy_host_ref_row,
-                                               std::string("qy[") + std::to_string(i_r) +
-                                                   std::string("] Error: Incorrect results!"),
-                                               rtol,
-                                               atol);
-                }
-            }
-        }
-
-        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
-    }
-
-    return pass;
-}
-
-std::vector<std::vector<std::string>> create_test_cases(const std::string prec)
-{
-    return {{"-prec=" + prec, "-m=99", "-n=13", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=17", "-n=16", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=1", "-n=100", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=4", "-n=128", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=80", "-n=127", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=22", "-n=255", "-x_stride=256"},
-            {"-prec=" + prec, "-m=7", "-n=599", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=19", "-n=512", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=33", "-n=313", "-x_stride=1000"},
-            {"-prec=" + prec, "-m=11", "-n=510", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=171", "-n=676", "-x_stride=818"},
-            {"-prec=" + prec, "-m=91", "-n=636", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=12", "-n=768", "-x_stride=800"},
-            {"-prec=" + prec, "-m=100", "-n=766", "-x_stride=812"},
-            {"-prec=" + prec, "-m=31", "-n=1024", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=64", "-n=1000", "-x_stride=1004"},
-            {"-prec=" + prec, "-m=8", "-n=1501", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=3", "-n=1826", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=5", "-n=2040", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=7", "-n=2734", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=1", "-n=3182", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=9", "-n=4096", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=3", "-n=8192", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=1", "-n=10547", "-x_stride=-1"},
-            {"-prec=" + prec, "-m=3", "-n=17134", "-x_stride=-1"}};
-}
-
-template <typename DataType>
-bool run_test_case(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return false;
-
-    return run<DataType>(arg_parser);
-}
-
-template <typename DataType>
-bool run_test_cases(std::vector<std::vector<std::string>>& test_cases)
-{
-    bool valid             = true;
-    constexpr int num_args = 4;
-
-    char* argv[num_args];
-
-    for(std::size_t test_idx = 0; test_idx < test_cases.size(); ++test_idx)
-    {
-        assert(test_cases[test_idx].size() == num_args &&
-               "invalid number of arguments in test case");
-        for(std::size_t idx = 0; idx < num_args; ++idx)
-        {
-            argv[idx] = test_cases[test_idx][idx].data();
-        }
-        valid = valid && run_test_case<DataType>(num_args, argv);
-
-        if(!valid)
-            break;
-    }
-
-    return valid;
-}
diff --git a/test/ck_tile/smoothquant/smoothquant_bf16.cpp b/test/ck_tile/smoothquant/smoothquant_bf16.cpp
deleted file mode 100644
index 4f5a8ac63e..0000000000
--- a/test/ck_tile/smoothquant/smoothquant_bf16.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#include "smoothquant.inc"
-
-int main()
-{
-    std::vector<std::vector<std::string>> test_cases = create_test_cases("bf16");
-
-    return !run_test_cases<ck_tile::bf16_t>(test_cases);
-}
diff --git a/test/ck_tile/smoothquant/smoothquant_fp16.cpp b/test/ck_tile/smoothquant/smoothquant_fp16.cpp
deleted file mode 100644
index 7d822b4903..0000000000
--- a/test/ck_tile/smoothquant/smoothquant_fp16.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier:  MIT
-
-#include "smoothquant.inc"
-
-int main()
-{
-    std::vector<std::vector<std::string>> test_cases = create_test_cases("fp16");
-
-    return !run_test_cases<ck_tile::half_t>(test_cases);
-}
diff --git a/test/ck_tile/smoothquant/test_smoothquant.cpp b/test/ck_tile/smoothquant/test_smoothquant.cpp
new file mode 100644
index 0000000000..6cce425e1b
--- /dev/null
+++ b/test/ck_tile/smoothquant/test_smoothquant.cpp
@@ -0,0 +1,14 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include "test_smoothquant_types.hpp"
+#include "test_smoothquant_util.hpp"
+#include "gtest/gtest.h"
+// test_smoothquant_cases.inc names its suite through TEST_SUITE_NAME, so define it first.
+#define TEST_SUITE_NAME TestCkTileSmoothquant
+
+TYPED_TEST_SUITE(TestCkTileSmoothquant, KernelTypesSmoothquant);
+// The shared case list below expands once for every type in KernelTypesSmoothquant.
+#include "test_smoothquant_cases.inc"
+
+#undef TEST_SUITE_NAME
diff --git a/test/ck_tile/smoothquant/test_smoothquant_cases.inc b/test/ck_tile/smoothquant/test_smoothquant_cases.inc
new file mode 100755
index 0000000000..27a7ea4676
--- /dev/null
+++ b/test/ck_tile/smoothquant/test_smoothquant_cases.inc
@@ -0,0 +1,218 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#pragma once
+
+#ifndef TEST_SMOOTHQUANT_CASES_INC
+#define TEST_SMOOTHQUANT_CASES_INC
+
+// Shape sweep for the smoothquant typed suite. The including file must define
+// TEST_SUITE_NAME as the fixture name; each case forwards to the fixture's
+// Run(m, n[, x_stride]), where an omitted x_stride defaults to n.
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m99_n13)
+{
+    ck_tile::index_t m = 99;
+    ck_tile::index_t n = 13;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m17_n16)
+{
+    ck_tile::index_t m = 17;
+    ck_tile::index_t n = 16;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m1_n100)
+{
+    ck_tile::index_t m = 1;
+    ck_tile::index_t n = 100;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m4_n128)
+{
+    ck_tile::index_t m = 4;
+    ck_tile::index_t n = 128;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m80_n127)
+{
+    ck_tile::index_t m = 80;
+    ck_tile::index_t n = 127;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m22_n255)
+{
+    ck_tile::index_t m        = 22;
+    ck_tile::index_t n        = 255;
+    ck_tile::index_t x_stride = 256;
+
+    this->Run(m, n, x_stride);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m7_n599)
+{
+    ck_tile::index_t m = 7;
+    ck_tile::index_t n = 599;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m19_n512)
+{
+    ck_tile::index_t m = 19;
+    ck_tile::index_t n = 512;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m33_n313)
+{
+    ck_tile::index_t m        = 33;
+    ck_tile::index_t n        = 313;
+    ck_tile::index_t x_stride = 1000;
+
+    this->Run(m, n, x_stride);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m11_n510)
+{
+    ck_tile::index_t m = 11;
+    ck_tile::index_t n = 510;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m171_n676)
+{
+    ck_tile::index_t m        = 171;
+    ck_tile::index_t n        = 676;
+    ck_tile::index_t x_stride = 818;
+
+    this->Run(m, n, x_stride);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m91_n636)
+{
+    ck_tile::index_t m = 91;
+    ck_tile::index_t n = 636;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m12_n768)
+{
+    ck_tile::index_t m        = 12;
+    ck_tile::index_t n        = 768;
+    ck_tile::index_t x_stride = 800;
+
+    this->Run(m, n, x_stride);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m100_n766)
+{
+    ck_tile::index_t m        = 100;
+    ck_tile::index_t n        = 766;
+    ck_tile::index_t x_stride = 812;
+
+    this->Run(m, n, x_stride);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m31_n1024)
+{
+    ck_tile::index_t m = 31;
+    ck_tile::index_t n = 1024;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m64_n1000)
+{
+    ck_tile::index_t m        = 64;
+    ck_tile::index_t n        = 1000;
+    ck_tile::index_t x_stride = 1004;
+
+    this->Run(m, n, x_stride);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m8_n1501)
+{
+    ck_tile::index_t m = 8;
+    ck_tile::index_t n = 1501;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m3_n1826)
+{
+    ck_tile::index_t m = 3;
+    ck_tile::index_t n = 1826;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m5_n2040)
+{
+    ck_tile::index_t m = 5;
+    ck_tile::index_t n = 2040;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m7_n2734)
+{
+    ck_tile::index_t m = 7;
+    ck_tile::index_t n = 2734;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m1_n3182)
+{
+    ck_tile::index_t m = 1;
+    ck_tile::index_t n = 3182;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m9_n4096)
+{
+    ck_tile::index_t m = 9;
+    ck_tile::index_t n = 4096;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m3_n8192)
+{
+    ck_tile::index_t m = 3;
+    ck_tile::index_t n = 8192;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m1_n10547)
+{
+    ck_tile::index_t m = 1;
+    ck_tile::index_t n = 10547;
+
+    this->Run(m, n);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Smoothquant_m3_n17134)
+{
+    ck_tile::index_t m = 3;
+    ck_tile::index_t n = 17134;
+
+    this->Run(m, n);
+}
+
+#endif
diff --git a/test/ck_tile/smoothquant/test_smoothquant_types.hpp b/test/ck_tile/smoothquant/test_smoothquant_types.hpp
new file mode 100644
index 0000000000..7f79ce3ff9
--- /dev/null
+++ b/test/ck_tile/smoothquant/test_smoothquant_types.hpp
@@ -0,0 +1,9 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include <tuple>
+#include "ck_tile/host.hpp"
+#include "gtest/gtest.h"
+// Data types the smoothquant typed tests run with; the fixture reads element 0 of each tuple.
+using KernelTypesSmoothquant =
+    ::testing::Types<std::tuple<ck_tile::fp16_t>, std::tuple<ck_tile::bf16_t>>;
diff --git a/test/ck_tile/smoothquant/test_smoothquant_util.hpp b/test/ck_tile/smoothquant/test_smoothquant_util.hpp
new file mode 100644
index 0000000000..5c1b733e03
--- /dev/null
+++ b/test/ck_tile/smoothquant/test_smoothquant_util.hpp
@@ -0,0 +1,181 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#include "ck_tile/host.hpp"
+#include "smoothquant.hpp"
+#include <cstring>
+
+// (rtol, atol) for check_err by data type: 1e-5 by default (bf16 repeats it), 1 for int8.
+template <typename DataType>
+auto get_elimit()
+{
+    double rtol = 1e-5;
+    double atol = 1e-5;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::bf16_t>()
+{
+    double rtol = 1e-5;
+    double atol = 1e-5;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::int8_t>()
+{
+    // due to rounding, int8 quantization might have 1 abs error
+    double rtol = 1;
+    double atol = 1;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <typename Tuple>
+class TestCkTileSmoothquant : public ::testing::Test
+{
+    // Typed fixture: element 0 of Tuple selects the data type the kernel is run with.
+    protected:
+    using DataType = std::tuple_element_t<0, Tuple>;
+    // Runs one m x n case and verifies it on the host; a negative stride defaults to n.
+    void Run(ck_tile::index_t m,
+             ck_tile::index_t n,
+             ck_tile::index_t x_stride = -1,
+             ck_tile::index_t y_stride = -1)
+    {
+        if(x_stride < 0)
+            x_stride = n;
+        if(y_stride < 0)
+            y_stride = n;
+        // gtest check rather than assert(): it still runs when NDEBUG is defined.
+        ASSERT_TRUE(x_stride >= n && y_stride >= n);
+
+        using TypeConfig = SmoothquantTypeConfig<DataType>;
+
+        using XDataType           = typename TypeConfig::XDataType;
+        using SmoothScaleDataType = typename TypeConfig::SmoothScaleDataType;
+        using YScaleDataType      = typename TypeConfig::YScaleDataType;
+        using QYDataType          = typename TypeConfig::QYDataType;
+        using ComputeDataType     = typename TypeConfig::ComputeDataType;
+
+        // host tensors: kernel inputs plus reference (_ref) and device-readback (_dev) outputs
+        ck_tile::HostTensor<XDataType> x_host({m, n}, {x_stride, 1});
+        ck_tile::HostTensor<SmoothScaleDataType> smscale_host({n});
+
+        ck_tile::HostTensor<YScaleDataType> yscale_host_ref({m}, {1});
+        ck_tile::HostTensor<YScaleDataType> yscale_host_dev({m}, {1});
+
+        ck_tile::HostTensor<QYDataType> qy_host_ref({m, n}, {y_stride, 1});
+        ck_tile::HostTensor<QYDataType> qy_host_dev({m, n}, {y_stride, 1});
+
+        ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
+        ck_tile::FillUniformDistribution<SmoothScaleDataType>{1e-3, .5f}(smscale_host);
+
+        ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem smscale_buf(smscale_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes());
+
+        x_buf.ToDevice(x_host.data());
+        smscale_buf.ToDevice(smscale_host.data());
+
+        std::cout << "m:" << m << ", n:" << n << ", x_stride:" << x_stride
+                  << ", y_stride:" << y_stride << std::flush;
+
+        smoothquant_args args{x_buf.GetDeviceBuffer(),
+                              smscale_buf.GetDeviceBuffer(),
+                              yscale_buf.GetDeviceBuffer(),
+                              qy_buf.GetDeviceBuffer(),
+                              m,
+                              n,
+                              x_stride,
+                              y_stride};
+
+        smoothquant<DataType>(args, ck_tile::stream_config{nullptr, false});
+
+        bool pass = true;
+
+        using YDataType = ComputeDataType;
+        ck_tile::HostTensor<ComputeDataType> y_host({m, n}, {y_stride, 1});
+        // smooth outlier: y = x * smscale, one scale per column, computed in ComputeDataType
+        {
+            auto f = [&](auto n_) {
+                auto v_smscale = ck_tile::type_convert<ComputeDataType>(smscale_host(n_));
+
+                for(int m_ = 0; m_ < m; ++m_)
+                {
+                    auto v_x       = ck_tile::type_convert<ComputeDataType>(x_host(m_, n_));
+                    y_host(m_, n_) = v_x * v_smscale;
+                }
+            };
+
+            ck_tile::make_ParallelTensorFunctor(f, smscale_host.get_element_space_size())(
+                std::thread::hardware_concurrency());
+        }
+
+        // yscale: row-wise abs-max of y divided by the largest QYDataType value
+        {
+            ck_tile::HostTensor<YDataType> y_rowwise_amax_host({m});
+
+            using ReduceAmax = ck_tile::ReduceOp::AbsMax;
+            ck_tile::reference_reduce<ComputeDataType, ComputeDataType, YDataType>(
+                y_host, y_rowwise_amax_host, ReduceAmax{});
+
+            auto op = [](const auto& v0) {
+                return v0 /
+                       ck_tile::type_convert<ComputeDataType>(ck_tile::numeric<QYDataType>::max());
+            };
+            ck_tile::reference_unary_elementwise<YDataType, YScaleDataType, ComputeDataType>(
+                y_rowwise_amax_host, yscale_host_ref, op);
+
+            yscale_buf.FromDevice(yscale_host_dev.mData.data());
+
+            auto [rtol, atol] = get_elimit<YScaleDataType>();
+            pass &= ck_tile::check_err(yscale_host_dev,
+                                       yscale_host_ref,
+                                       std::string("yscale Error: Incorrect results!"),
+                                       rtol,
+                                       atol);
+        }
+
+        // rowwise quantization: compare qy, row by row when rows are padded (y_stride != n)
+        {
+            ck_tile::reference_rowwise_quantization2d<YDataType, YScaleDataType, QYDataType>(
+                y_host, yscale_host_ref, qy_host_ref);
+
+            qy_buf.FromDevice(qy_host_dev.data());
+            auto [rtol, atol] = get_elimit<QYDataType>();
+            // &= (not =) so a yscale mismatch recorded above is not discarded here.
+            if(y_stride == n)
+            {
+                pass &= ck_tile::check_err(qy_host_dev,
+                                           qy_host_ref,
+                                           std::string("qy Error: Incorrect results!"),
+                                           rtol,
+                                           atol);
+            }
+            else
+            {
+                for(int i_r = 0; i_r < m; i_r++)
+                {
+                    std::vector<QYDataType> qy_host_dev_row(qy_host_dev.begin() + i_r * y_stride,
+                                                            qy_host_dev.begin() + i_r * y_stride +
+                                                                n);
+                    std::vector<QYDataType> qy_host_ref_row(qy_host_ref.begin() + i_r * y_stride,
+                                                            qy_host_ref.begin() + i_r * y_stride +
+                                                                n);
+                    pass &= ck_tile::check_err(qy_host_dev_row,
+                                               qy_host_ref_row,
+                                               std::string("qy[") + std::to_string(i_r) +
+                                                   std::string("] Error: Incorrect results!"),
+                                               rtol,
+                                               atol);
+                }
+            }
+        }
+
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+
+        EXPECT_TRUE(pass);
+    }
+};
diff --git a/test/ck_tile/topk_softmax/test_topk_softmax_api.cpp b/test/ck_tile/topk_softmax/test_topk_softmax_api.cpp
index 46c7abc697..7c90c8200c 100644
--- a/test/ck_tile/topk_softmax/test_topk_softmax_api.cpp
+++ b/test/ck_tile/topk_softmax/test_topk_softmax_api.cpp
@@ -13,11 +13,11 @@
                                                                                                 \
     auto kargs = kernel::MakeKargs(a);                                                          \
                                                                                                 \
-    const dim3 grids      = kernel::GridSize(a);                                                \
-    constexpr dim3 blocks = kernel::BlockSize();                                                \
+    const dim3 grids  = kernel::GridSize(a);                                                    \
+    const dim3 blocks = kernel::BlockSize();                                                    \
                                                                                                 \
-    float ave_time = ck_tile::launch_kernel(                                                    \
-        s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs));               \
+    float ave_time =                                                                            \
+        ck_tile::launch_kernel(s, ck_tile::make_kernel<1>(kernel{}, grids, blocks, 0, kargs));  \
                                                                                                 \
     return ave_time;
 
diff --git a/test/grouped_convnd_fwd/CMakeLists.txt b/test/grouped_convnd_fwd/CMakeLists.txt
index 4ceb4a2d99..24622fa0b5 100644
--- a/test/grouped_convnd_fwd/CMakeLists.txt
+++ b/test/grouped_convnd_fwd/CMakeLists.txt
@@ -11,6 +11,10 @@ if(GPU_TARGETS MATCHES "gfx9")
     add_executable(test_grouped_convnd_fwd_large_cases_xdl test_grouped_convnd_fwd_large_cases_xdl.cpp)
     target_compile_options(test_grouped_convnd_fwd_large_cases_xdl PRIVATE -Wno-global-constructors -Wno-undef)
     target_link_libraries(test_grouped_convnd_fwd_large_cases_xdl PRIVATE gtest_main getopt::getopt utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
+
+    add_executable(test_grouped_convnd_fwd_dataset_xdl test_grouped_convnd_fwd_dataset_xdl.cpp)
+    target_compile_options(test_grouped_convnd_fwd_dataset_xdl PRIVATE -Wno-global-constructors -Wno-undef)
+    target_link_libraries(test_grouped_convnd_fwd_dataset_xdl PRIVATE gtest_main getopt::getopt utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
 endif()
 
 add_gtest_executable(test_grouped_convnd_fwd_multi_ab_interface test_grouped_convnd_fwd_multi_ab_interface.cpp)
diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp
new file mode 100644
index 0000000000..ded68d9a44
--- /dev/null
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp
@@ -0,0 +1,335 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>          // Standard C library (exit codes, malloc)
+#include <iostream>         // C++ I/O streams (cout, cerr)
+#include <initializer_list> // C++ initializer list support (unused here)
+#include <vector>           // C++ vector container - stores test cases
+#include <fstream>          // File I/O for CSV reading
+#include <sstream>          // String stream for CSV parsing
+#include <string>           // String operations
+#include <gtest/gtest.h>    // Google Test framework - provides TYPED_TEST, EXPECT_TRUE
+
+#include "profiler/profile_grouped_conv_fwd_impl.hpp" // The actual GPU profiler that does convolution work
+
+// Loads convolution test cases from a CSV file: one ConvParam per accepted 2D or 3D data row.
+// An unreadable file yields an empty vector; malformed or unsupported rows are reported and skipped.
+std::vector<ck::utils::conv::ConvParam> load_csv_test_cases(const std::string& filename)
+{
+    std::vector<ck::utils::conv::ConvParam> conv_params; // Return vector
+    std::ifstream file(filename);                        // Open CSV file
+
+    if(!file.is_open())
+    {
+        std::cerr << "ERROR: Cannot open CSV file: " << filename << std::endl;
+        return conv_params; // Return empty vector on error
+    }
+
+    std::string line;
+    int line_number = 0;
+
+    // Read file line by line; every raw line is echoed to stdout before any filtering
+    while(std::getline(file, line))
+    {
+        line_number++;
+        std::cout << "Line " << line_number << ": " << line << std::endl;
+        // Skip comment lines (starting with #) and empty lines
+        if(line.empty() || line[0] == '#')
+        {
+            continue;
+        }
+
+        // Skip an uncommented header line (recognised by its first three column names)
+        if(line.find("NDim,Groups,BatchSize") != std::string::npos)
+        {
+            continue;
+        }
+
+        // Parse CSV line using stringstream
+        std::stringstream ss(line);
+        std::string cell;
+        std::vector<std::string> row;
+
+        // Split line by commas
+        while(std::getline(ss, cell, ','))
+        {
+            row.push_back(cell);
+        }
+
+        // Every row needs the 19 numeric columns of the 2D layout; 3D rows are re-checked below
+        if(row.size() < 19)
+        { // Need at least 19 columns for 2D (excluding TestName)
+            std::cerr << "WARNING: Line " << line_number << " has insufficient columns ("
+                      << row.size() << "), skipping" << std::endl;
+            continue;
+        }
+
+        try
+        {
+            // Parse CSV data into ConvParam structure (std::stoi failures are caught below)
+            // CSV format of a 2D row:
+            // NDim,Groups,BatchSize,OutChannels,InChannels,KernelH,KernelW,InputH,InputW,OutputH,OutputW,StrideH,StrideW,DilationH,DilationW,LeftPadH,LeftPadW,RightPadH,RightPadW,TestName
+            int NDim        = std::stoi(row[0]);
+            int Groups      = std::stoi(row[1]);
+            int BatchSize   = std::stoi(row[2]);
+            int OutChannels = std::stoi(row[3]);
+            int InChannels  = std::stoi(row[4]);
+
+            if(NDim == 2)
+            {
+                // 2D Convolution: {NDim, Groups, BatchSize, OutChannels, InChannels,
+                // {KernelH,KernelW}, {InputH,InputW}, {StrideH,StrideW}, {DilationH,DilationW},
+                // {LeftPadH,LeftPadW}, {RightPadH,RightPadW}}; OutputH/OutputW (row[9..10]) unused
+                ck::utils::conv::ConvParam param = {
+                    NDim,                                     // NDim = 2
+                    Groups,                                   // Groups
+                    BatchSize,                                // Batch size
+                    OutChannels,                              // Output channels
+                    InChannels,                               // Input channels
+                    {std::stoi(row[5]), std::stoi(row[6])},   // Kernel: {H, W}
+                    {std::stoi(row[7]), std::stoi(row[8])},   // Input: {H, W}
+                    {std::stoi(row[11]), std::stoi(row[12])}, // Stride: {H, W}
+                    {std::stoi(row[13]), std::stoi(row[14])}, // Dilation: {H, W}
+                    {std::stoi(row[15]), std::stoi(row[16])}, // Left pad: {H, W}
+                    {std::stoi(row[17]), std::stoi(row[18])}  // Right pad: {H, W}
+                };
+                conv_params.push_back(param);
+            }
+            else if(NDim == 3)
+            {
+                // 3D rows need 26+ columns; row[11..13] (presumably OutputD/H/W) are not read
+                if(row.size() < 26)
+                {
+                    std::cerr << "WARNING: 3D convolution on line " << line_number
+                              << " needs 26+ columns, has " << row.size() << ", skipping"
+                              << std::endl;
+                    continue;
+                }
+                // 3D Convolution: {NDim, Groups, BatchSize, OutChannels, InChannels,
+                // {KernelD,KernelH,KernelW}, {InputD,InputH,InputW},
+                // {StrideD,StrideH,StrideW}, {DilationD,DilationH,DilationW},
+                // {LeftPadD,LeftPadH,LeftPadW}, {RightPadD,RightPadH,RightPadW}}
+                ck::utils::conv::ConvParam param = {
+                    NDim,                                                       // NDim = 3
+                    Groups,                                                     // Groups
+                    BatchSize,                                                  // Batch size
+                    OutChannels,                                                // Output channels
+                    InChannels,                                                 // Input channels
+                    {std::stoi(row[5]), std::stoi(row[6]), std::stoi(row[7])},  // Kernel: {D, H, W}
+                    {std::stoi(row[8]), std::stoi(row[9]), std::stoi(row[10])}, // Input: {D, H, W}
+                    {std::stoi(row[14]),
+                     std::stoi(row[15]),
+                     std::stoi(row[16])}, // Stride: {D, H, W}
+                    {std::stoi(row[17]),
+                     std::stoi(row[18]),
+                     std::stoi(row[19])}, // Dilation: {D, H, W}
+                    {std::stoi(row[20]),
+                     std::stoi(row[21]),
+                     std::stoi(row[22])}, // Left pad: {D, H, W}
+                    {std::stoi(row[23]),
+                     std::stoi(row[24]),
+                     std::stoi(row[25])} // Right pad: {D, H, W}
+                };
+                conv_params.push_back(param);
+            }
+            else
+            {
+                std::cerr << "WARNING: Unsupported NDim=" << NDim << " on line " << line_number
+                          << ", skipping" << std::endl;
+            }
+        }
+        catch(const std::exception& e)
+        {
+            std::cerr << "ERROR: Failed to parse line " << line_number << ": " << e.what()
+                      << std::endl;
+            continue;
+        }
+    }
+
+    file.close();
+    std::cout << "Loaded " << conv_params.size() << " test cases from " << filename << std::endl;
+    return conv_params;
+}
+
+// Typed fixture; Tuple = <data type, input layout, weight layout, output layout>
+template <typename Tuple>
+class TestGroupedConvndFwd : public ::testing::Test // Inherit from Google Test base class
+{
+    protected:
+    using DataType =
+        std::tuple_element_t<0, Tuple>; // Element type passed for input, weight and output
+    using InLayout =
+        std::tuple_element_t<1, Tuple>; // Input tensor layout (NHWGC, NDHWGC, etc.)
+    using WeiLayout =
+        std::tuple_element_t<2, Tuple>; // Weight tensor layout (GKYXC, GKZYXC, etc.)
+    using OutLayout =
+        std::tuple_element_t<3, Tuple>; // Output tensor layout (NHWGK, NDHWGK, etc.)
+    using IndexType = ck::long_index_t; // Index type forwarded to the profiler
+
+    // Test cases to execute. Each TYPED_TEST fills this before calling Run();
+    // Run() reports a failure when it is still empty.
+    std::vector<ck::utils::conv::ConvParam> conv_params;
+
+    // Runs every entry of conv_params through the profiler with NDimSpatial spatial dims
+    template <ck::index_t NDimSpatial>
+    void Run()
+    {
+        EXPECT_FALSE(conv_params.empty()); // An empty case list is reported as a failure
+        bool pass = true;                  // Overall pass/fail across all test cases
+
+        // MAIN LOOP: execute every test case, including those after a failing one
+        for(auto& param : conv_params)
+        {
+            // Call the profiler unconditionally so one failure cannot short-circuit later cases
+            const bool case_pass =
+                ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
+                                                            InLayout,   // Input tensor layout
+                                                            WeiLayout,  // Weight tensor layout
+                                                            OutLayout,  // Output tensor layout
+                                                            DataType,   // Input data type
+                                                            DataType,   // Weight data type
+                                                            DataType,   // Output data type
+                                                            DataType,   // 8th arg - TODO confirm role
+                                                            DataType,   // 9th arg - TODO confirm role
+                                                            IndexType>( // Index type
+                    true,   // do_verification: check the result against a reference
+                    1,      // init_method: value 1 is interpreted by the profiler
+                    false,  // do_log: don't print detailed tensor values
+                    false,  // time_kernel: correctness only, no timing
+                    param); // ConvParam describing this case
+            pass = pass && case_pass;
+        }
+        EXPECT_TRUE(pass); // Google Test assertion: ALL test cases must pass
+    }
+};
+
+using namespace ck::tensor_layout::convolution; // Import tensor layout names (NHWGC, GKYXC, etc.)
+
+// Type/layout combinations the typed tests are instantiated with.
+// 2D: four tuples of <data type, input layout, weight layout, output layout>
+using KernelTypes2d =
+    ::testing::Types<std::tuple<float, NHWGC, GKYXC, NHWGK>,       // fp32 test
+                     std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>,  // fp16 test
+                     std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>, // bfloat16 test
+                     std::tuple<int8_t, NHWGC, GKYXC, NHWGK>>;     // int8 test
+
+// 3D: three tuples; int8 is not listed here (presumably unsupported for 3D - TODO confirm)
+using KernelTypes3d =
+    ::testing::Types<std::tuple<float, NDHWGC, GKZYXC, NDHWGK>,        // fp32 3D test
+                     std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK>,   // fp16 3D test
+                     std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>>; // bfloat16 3D test
+
+// Empty derived fixtures: they only give the 2D and 3D suites distinct names
+template <typename Tuple>
+class TestGroupedConvndFwd2d : public TestGroupedConvndFwd<Tuple> // 2D convolution test class
+{
+};
+
+template <typename Tuple>
+class TestGroupedConvndFwd3d : public TestGroupedConvndFwd<Tuple> // 3D convolution test class
+{
+};
+
+// Bind each fixture to its type list
+// 2D suite: one instantiation per KernelTypes2d entry (fp32, fp16, bf16, int8)
+TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d);
+// 3D suite: one instantiation per KernelTypes3d entry (fp32, fp16, bf16)
+TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d);
+
+// 2D dataset test - instantiated once per KernelTypes2d entry (fp32, fp16, bf16, int8)
+TYPED_TEST(TestGroupedConvndFwd2d, Test2D)
+{
+    // Load the test cases from a CSV file instead of hardcoding them.
+    // Candidate locations are tried in order; currently there is a single candidate.
+    std::vector<std::string> csv_paths = {
+        "../test_data/conv_test_set_2d_dataset.csv", // Relative to the working directory
+    };
+
+    bool loaded = false;
+    for(const auto& csv_path : csv_paths)
+    {
+        auto csv_cases = load_csv_test_cases(csv_path);
+        if(!csv_cases.empty())
+        {
+            // First location that yields at least one case wins - copy its cases over
+            for(const auto& test_case : csv_cases)
+            {
+                this->conv_params.push_back(test_case);
+            }
+            std::cout << "Loaded " << csv_cases.size() << " 2D test cases from " << csv_path
+                      << std::endl;
+            loaded = true;
+            break;
+        }
+    }
+
+    // Abort the test if nothing could be loaded - there is no hardcoded fallback
+    if(!loaded)
+    {
+        std::cerr << "ERROR: Failed to load CSV test data from any of these locations:"
+                  << std::endl;
+        for(const auto& path : csv_paths)
+        {
+            std::cerr << "  - " << path << std::endl;
+        }
+        std::cerr << "\nPlease ensure CSV test data exists in one of these locations." << std::endl;
+        std::cerr << "Run generate_test_dataset.sh in test_data/ to create test datasets."
+                  << std::endl;
+
+        // Fatal assertion: return here instead of running Run<2>() with an empty case list
+        ASSERT_TRUE(loaded) << "CSV test data loading failed";
+    }
+
+    // Execute all test cases with 2D convolution
+    // Run<2>() loops through conv_params and calls the profiler for each entry
+    this->template Run<2>();
+}
+
+// 3D dataset test - instantiated once per KernelTypes3d entry (fp32, fp16, bf16)
+TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
+{
+    // Load the test cases from a CSV file instead of hardcoding them.
+    // Candidate locations are tried in order; currently there is a single candidate.
+    std::vector<std::string> csv_paths = {
+        "../test_data/conv_test_set_3d_dataset.csv", // Relative to the working directory
+    };
+
+    bool loaded = false;
+    for(const auto& csv_path : csv_paths)
+    {
+        auto csv_cases = load_csv_test_cases(csv_path);
+        if(!csv_cases.empty())
+        {
+            // First location that yields at least one case wins - copy its cases over
+            for(const auto& test_case : csv_cases)
+            {
+                this->conv_params.push_back(test_case);
+            }
+            std::cout << "Loaded " << csv_cases.size() << " 3D test cases from " << csv_path
+                      << std::endl;
+            loaded = true;
+            break;
+        }
+    }
+
+    // Abort the test if nothing could be loaded - there is no hardcoded fallback
+    if(!loaded)
+    {
+        std::cerr << "ERROR: Failed to load CSV test data from any of these locations:"
+                  << std::endl;
+        for(const auto& path : csv_paths)
+        {
+            std::cerr << "  - " << path << std::endl;
+        }
+        std::cerr << "\nPlease ensure CSV test data exists in one of these locations." << std::endl;
+        std::cerr << "Run generate_test_dataset.sh in test_data/ to create test datasets."
+                  << std::endl;
+
+        // Fatal assertion: return here instead of running Run<3>() with an empty case list
+        ASSERT_TRUE(loaded) << "CSV test data loading failed";
+    }
+
+    // Execute all test cases with 3D convolution
+    // Run<3>() loops through conv_params and calls the profiler for each entry
+    this->template Run<3>();
+}
diff --git a/test_data/generate_model_configs.py b/test_data/generate_model_configs.py
new file mode 100644
index 0000000000..d799c0fb94
--- /dev/null
+++ b/test_data/generate_model_configs.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""
+Generate Model Configuration Combinations for MIOpen Testing
+
+This script generates all possible combinations of model parameters
+and saves them as CSV files that can be read by the shell script.
+"""
+
+import csv
+import itertools
+import argparse
+
+def generate_2d_configs():
+    """Generate the 2D model configuration combinations.
+
+    Builds the cross product of model, batch size, input size and precision,
+    dropping combinations that would make the dataset too large.
+
+    Returns:
+        list[dict]: one dict per configuration with the keys ``config_name``,
+        ``model``, ``batch_size``, ``channels``, ``height``, ``width`` and
+        ``precision``, ordered model-major.
+    """
+
+    # Define parameter ranges
+    models_2d = [
+        'resnet18', 'resnet34', 'resnet50',
+        'mobilenet_v2', 'mobilenet_v3_large', 'mobilenet_v3_small',
+        'vgg11', 'vgg16', 'vgg19',
+        'alexnet', 'googlenet',
+        'densenet121', 'densenet161',
+        'squeezenet1_0', 'squeezenet1_1',
+        'shufflenet_v2_x1_0',
+    ]
+
+    batch_sizes = [1, 4, 8, 16, 32]
+
+    # Input dimensions: (height, width)
+    input_dims = [
+        (64, 64), (128, 128), (224, 224), (256, 256), (512, 512),  # Square
+        (224, 320), (224, 448), (320, 224), (448, 224),            # Rectangular
+        (227, 227),  # AlexNet preferred
+        (299, 299),  # Inception preferred
+    ]
+
+    precisions = ['fp32']  # 'fp16' and 'bf16' are currently disabled
+    channels = 3  # Most models expect RGB
+
+    configs = []
+
+    # product() iterates in the same order as the equivalent nested loops.
+    for model, batch_size, (height, width), precision in itertools.product(
+            models_2d, batch_sizes, input_dims, precisions):
+        # Skip some combinations to keep the dataset manageable.
+        if batch_size > 16 and height > 256:
+            continue  # Large batch + large image
+        if precision != 'fp32' and batch_size < 8:
+            continue  # Reduced precision with tiny batches
+
+        configs.append({
+            'config_name': f"{model}_b{batch_size}_{height}x{width}_{precision}",
+            'model': model,
+            'batch_size': batch_size,
+            'channels': channels,
+            'height': height,
+            'width': width,
+            'precision': precision,
+        })
+
+    return configs
+
+def generate_3d_configs():
+    """Generate the 3D model configuration combinations.
+
+    Returns:
+        list[dict]: one dict per configuration with the keys ``config_name``,
+        ``model``, ``batch_size``, ``channels``, ``temporal_size``, ``height``,
+        ``width`` and ``precision``, ordered model-major.
+    """
+    models_3d = ['r3d_18', 'mc3_18', 'r2plus1d_18']
+    batch_sizes = [1, 2, 4, 8]  # 3D models are more memory intensive
+    temporal_sizes = [8, 16, 32]
+
+    # 3D input dimensions: (height, width)
+    input_dims = [
+        (112, 112), (224, 224), (256, 256),  # Standard sizes
+        (224, 320), (320, 224),              # Rectangular
+    ]
+
+    precisions = ['fp32']  # 'fp16' is disabled; bf16 is skipped for 3D
+    channels = 3
+
+    configs = []
+
+    # product() iterates in the same order as the equivalent nested loops.
+    for model, batch_size, temporal_size, (height, width), precision in (
+            itertools.product(models_3d, batch_sizes, temporal_sizes,
+                              input_dims, precisions)):
+        # Skip very large combinations.
+        if batch_size > 4 and temporal_size > 16:
+            continue
+        if batch_size > 2 and height > 224:
+            continue
+
+        configs.append({
+            'config_name': (f"{model}_b{batch_size}_t{temporal_size}"
+                            f"_{height}x{width}_{precision}"),
+            'model': model,
+            'batch_size': batch_size,
+            'channels': channels,
+            'temporal_size': temporal_size,
+            'height': height,
+            'width': width,
+            'precision': precision,
+        })
+
+    return configs
+
+def save_configs_to_csv(configs, filename, config_type):
+    """Write *configs* to *filename* as CSV preceded by two '#' comment lines.
+
+    The column order is taken from the keys of the first configuration. When
+    *configs* is empty a notice is printed and no file is written.
+    """
+    if not configs:
+        print(f"No {config_type} configurations generated")
+        return
+
+    total = len(configs)
+    columns = list(configs[0])
+    with open(filename, 'w', newline='\n', encoding='utf-8') as out:
+        out.write(f"# {config_type} Model Configurations\n")
+        out.write(f"# Generated {total} configurations\n")
+        rows = csv.DictWriter(out, fieldnames=columns, lineterminator='\n')
+        rows.writeheader()
+        rows.writerows(configs)
+
+    print(f"Generated {total} {config_type} configurations → {filename}")
+
+def main():
+    """Parse the command line and write the 2D and 3D configuration CSVs.
+
+    ``--limit N`` slices each generated list with ``[:N]`` before saving.
+    """
+    cli = argparse.ArgumentParser(description='Generate model configuration combinations')
+    cli.add_argument('--output-2d', type=str, default='model_configs_2d.csv',
+                     help='Output file for 2D configurations')
+    cli.add_argument('--output-3d', type=str, default='model_configs_3d.csv',
+                     help='Output file for 3D configurations')
+    cli.add_argument('--limit', type=int,
+                     help='Limit number of configurations per type (for testing)')
+    args = cli.parse_args()
+    cap = args.limit if args.limit else None  # an absent or zero limit keeps everything
+    print("Generating 2D model configurations...")
+    selected_2d = generate_2d_configs()[:cap]
+    save_configs_to_csv(selected_2d, args.output_2d, "2D")
+
+    print("Generating 3D model configurations...")
+    selected_3d = generate_3d_configs()[:cap]
+    save_configs_to_csv(selected_3d, args.output_3d, "3D")
+
+    n_2d, n_3d = len(selected_2d), len(selected_3d)
+    print(f"\nTotal configurations: {n_2d} 2D + {n_3d} 3D = {n_2d + n_3d}")
+    print("\nTo use these configurations:")
+    print("  Update generate_test_dataset.sh to read from these CSV files")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/test_data/generate_test_dataset.sh b/test_data/generate_test_dataset.sh
new file mode 100755
index 0000000000..621ea4f144
--- /dev/null
+++ b/test_data/generate_test_dataset.sh
@@ -0,0 +1,262 @@
+#!/bin/bash
+# Generate Comprehensive Convolution Test Dataset for CK
+# This script captures MIOpen commands from PyTorch models and generates test cases
+
+set -e  # Exit on error
+
+# Disabled guard: skip regeneration when both target files already exist
+# if [ -f "conv_test_set_2d_dataset.csv" ] && [ -f "conv_test_set_3d_dataset.csv" ]; then
+#     echo "Target files already exist:"
+#     [ -f "conv_test_set_2d_dataset.csv" ] && echo "  - conv_test_set_2d_dataset.csv ($(wc -l < conv_test_set_2d_dataset.csv) lines)"
+#     [ -f "conv_test_set_3d_dataset.csv" ] && echo "  - conv_test_set_3d_dataset.csv ($(wc -l < conv_test_set_3d_dataset.csv) lines)"
+#     echo ""
+#     echo "To regenerate, please remove these files first:"
+#     echo "  rm conv_test_set_2d_dataset.csv conv_test_set_3d_dataset.csv"
+#     exit 0
+# fi
+
+echo "=========================================="
+echo "CK Convolution Test Dataset Generator"
+echo "=========================================="
+
+# Configuration
+OUTPUT_DIR="generated_datasets"
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")  # Currently not referenced anywhere in this script
+MAX_ITERATIONS=0  # Maximum number of iterations per model type (set to 0 for unlimited)
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+PURPLE='\033[0;35m'
+CYAN='\033[0;36m'
+NC='\033[0m' # No Color
+
+# Recreate the output directory from scratch (intermediate files of earlier runs are deleted)
+rm -rf "$OUTPUT_DIR"
+mkdir -p "$OUTPUT_DIR"
+
+echo ""
+echo "Step 1: Generating model configurations"
+echo "-----------------------------------------"
+
+# Generate model configuration files (all combinations; no --limit is passed)
+echo "Generating model configuration files..."
+python3 generate_model_configs.py \
+    --output-2d "$OUTPUT_DIR/model_configs_2d.csv" \
+    --output-3d "$OUTPUT_DIR/model_configs_3d.csv"
+
+if [ ! -f "$OUTPUT_DIR/model_configs_2d.csv" ] || [ ! -f "$OUTPUT_DIR/model_configs_3d.csv" ]; then
+    echo "ERROR: Failed to generate configuration files"
+    exit 1
+fi
+
+
+# Warn when the ROCm tooling (rocm-smi) is not on PATH
+if ! command -v rocm-smi &> /dev/null; then
+    echo "WARNING: ROCm not detected. Models will run on CPU (no MIOpen commands)."
+    echo "For actual MIOpen commands, run this on a system with AMD GPU."
+fi
+
+
+echo ""
+echo "Step 2: Running 2D/3D models and capturing MIOpen commands"
+echo "-----------------------------------------"
+
+
+# Process 2D models from CSV configuration file
+echo "Processing 2D models from $OUTPUT_DIR/model_configs_2d.csv..."
+
+# Count total configurations (excluding comments and header)
+TOTAL_CONFIGS=$(grep -v "^#" "$OUTPUT_DIR/model_configs_2d.csv" | tail -n +2 | wc -l)
+CURRENT_CONFIG=0
+
+echo "Total configurations to process: $TOTAL_CONFIGS"
+echo ""
+
+# Read 2D configurations from CSV (skip comments and header)
+while IFS=',' read -r config_name model batch_size channels height width precision; do
+    # Skip comments and empty lines
+    [[ "$config_name" =~ ^#.*$ ]] && continue
+    [[ "$config_name" == "config_name" ]] && continue  # Skip header
+    [[ -z "$config_name" ]] && continue
+
+    # Increment counter
+    CURRENT_CONFIG=$((CURRENT_CONFIG + 1))
+
+    # Stop after MAX_ITERATIONS if set
+    if [ "$MAX_ITERATIONS" -gt 0 ] && [ "$CURRENT_CONFIG" -gt "$MAX_ITERATIONS" ]; then
+        echo -e "${RED}Stopping after $MAX_ITERATIONS iterations (testing mode)${NC}"
+        break
+    fi
+
+    # Human-readable summary of the arguments (used for display only)
+    CONFIG="--model $model --batch-size $batch_size --channels $channels --height $height --width $width --precision $precision"
+    CONFIG_NAME="$config_name"
+
+    echo -e "${GREEN}[${CURRENT_CONFIG}/${TOTAL_CONFIGS}]${NC} ${PURPLE}Running MIOpenDriver${NC} ${CYAN}2D${NC} ${YELLOW}$CONFIG_NAME${NC}: ${BLUE}$CONFIG${NC}"
+
+    # Run the model; stdin is detached so the child cannot consume the CSV this loop reads
+    MIOPEN_ENABLE_LOGGING_CMD=1 python3 run_model_with_miopen.py \
+        --model "$model" --batch-size "$batch_size" --channels "$channels" --height "$height" --width "$width" --precision "$precision" \
+        < /dev/null 2>> "$OUTPUT_DIR/${model}_miopen_log_2d.txt" \
+        || echo -e "${RED}  WARNING: model run failed for $CONFIG_NAME (continuing)${NC}" >&2
+
+done < "$OUTPUT_DIR/model_configs_2d.csv"
+
+# Process 3D models from CSV configuration file
+echo "Processing 3D models from $OUTPUT_DIR/model_configs_3d.csv..."
+
+# Count total 3D configurations (excluding comments and header)
+TOTAL_3D_CONFIGS=$(grep -v "^#" "$OUTPUT_DIR/model_configs_3d.csv" | tail -n +2 | wc -l)
+CURRENT_3D_CONFIG=0
+
+echo "Total 3D configurations to process: $TOTAL_3D_CONFIGS"
+echo ""
+
+# Read 3D configurations from CSV (skip comments and header)
+while IFS=',' read -r config_name model batch_size channels temporal_size height width precision; do
+    # Skip comments and empty lines
+    [[ "$config_name" =~ ^#.*$ ]] && continue
+    [[ "$config_name" == "config_name" ]] && continue  # Skip header
+    [[ -z "$config_name" ]] && continue
+
+    # Increment counter
+    CURRENT_3D_CONFIG=$((CURRENT_3D_CONFIG + 1))
+
+    # Stop after MAX_ITERATIONS if set
+    if [ "$MAX_ITERATIONS" -gt 0 ] && [ "$CURRENT_3D_CONFIG" -gt "$MAX_ITERATIONS" ]; then
+        echo -e "${RED}Stopping after $MAX_ITERATIONS iterations (testing mode)${NC}"
+        break
+    fi
+
+    # Human-readable summary of the arguments (used for display only)
+    CONFIG="--model $model --batch-size $batch_size --channels $channels --temporal-size $temporal_size --height $height --width $width --precision $precision"
+    CONFIG_NAME="$config_name"
+
+    echo -e "${GREEN}[${CURRENT_3D_CONFIG}/${TOTAL_3D_CONFIGS}]${NC} ${PURPLE}Running MIOpenDriver${NC} ${CYAN}3D${NC} ${YELLOW}$CONFIG_NAME${NC}: ${BLUE}$CONFIG${NC}"
+
+    # Run the model; stdin is detached so the child cannot consume the CSV this loop reads
+    MIOPEN_ENABLE_LOGGING_CMD=1 python3 run_model_with_miopen.py \
+        --model "$model" --batch-size "$batch_size" --channels "$channels" --temporal-size "$temporal_size" --height "$height" --width "$width" --precision "$precision" \
+        < /dev/null 2>> "$OUTPUT_DIR/${model}_miopen_log_3d.txt" \
+        || echo -e "${RED}  WARNING: model run failed for $CONFIG_NAME (continuing)${NC}" >&2
+
+done < "$OUTPUT_DIR/model_configs_3d.csv"
+
+
+echo ""
+echo "Step 3: Converting MIOpen commands to CSV test cases"
+echo "-----------------------------------------"
+
+# Convert 2D MIOpen logs to CSV (one log per model; a failed conversion is reported, not fatal)
+echo "Converting 2D MIOpen logs to CSV..."
+for log_file in "$OUTPUT_DIR"/*_miopen_log_2d.txt; do
+    if [ -f "$log_file" ]; then
+        # Extract model name from filename (e.g., resnet_miopen_log_2d.txt -> resnet)
+        base_name=$(basename "$log_file" _miopen_log_2d.txt)
+        output_csv="$OUTPUT_DIR/${base_name}_cases_2d.csv"
+
+        echo "  Converting $log_file -> $output_csv"
+        python3 miopen_to_csv.py \
+            --input "$log_file" \
+            --output-2d "$output_csv" \
+            --model-name "$base_name" \
+            --filter-duplicates || echo "  WARNING: conversion failed for $log_file" >&2
+    fi
+done
+
+# Convert 3D MIOpen logs to CSV (one log per model; a failed conversion is reported, not fatal)
+echo "Converting 3D MIOpen logs to CSV..."
+for log_file in "$OUTPUT_DIR"/*_miopen_log_3d.txt; do
+    if [ -f "$log_file" ]; then
+        # Extract model name from filename (e.g., resnet3d_18_miopen_log_3d.txt -> resnet3d_18)
+        base_name=$(basename "$log_file" _miopen_log_3d.txt)
+        output_csv="$OUTPUT_DIR/${base_name}_cases_3d.csv"
+
+        echo "  Converting $log_file -> $output_csv"
+        python3 miopen_to_csv.py \
+            --input "$log_file" \
+            --output-3d "$output_csv" \
+            --model-name "$base_name" \
+            --filter-duplicates || echo "  WARNING: conversion failed for $log_file" >&2
+    fi
+done
+
+echo ""
+echo "Step 4: Combining CSV files into final datasets"
+echo "-----------------------------------------"
+
+# Combine all per-model 2D CSV files into one dataset in the current directory
+echo "Combining all 2D test cases..."
+# Start the dataset (truncating any previous copy) with two comment lines
+echo "# 2D Convolution Test Cases" > conv_test_set_2d_dataset.csv
+echo "# Combined from multiple models" >> conv_test_set_2d_dataset.csv
+# Add the column header of the first file as a comment, so no uncommented header row is written
+first_2d_file=$(ls $OUTPUT_DIR/*_cases_2d.csv 2>/dev/null | head -1)
+if [ -f "$first_2d_file" ]; then
+    # Get the CSV header line and prefix with #
+    header_line=$(grep "^NDim," "$first_2d_file" | head -1)
+    if [ ! -z "$header_line" ]; then
+        echo "# $header_line" >> conv_test_set_2d_dataset.csv
+    fi
+fi
+# Append all data rows (skip comment lines and CSV header) from all files
+for csv_file in $OUTPUT_DIR/*_cases_2d.csv; do
+    if [ -f "$csv_file" ]; then
+        # Drop comment and header lines; `|| true` keeps set -e quiet when grep selects nothing
+        grep -v "^#" "$csv_file" | grep -v "^NDim," >> conv_test_set_2d_dataset.csv 2>/dev/null || true
+    fi
+done
+
+# Combine all per-model 3D CSV files into one dataset in the current directory
+echo "Combining all 3D test cases..."
+# Start the dataset (truncating any previous copy) with two comment lines
+echo "# 3D Convolution Test Cases" > conv_test_set_3d_dataset.csv
+echo "# Combined from multiple models" >> conv_test_set_3d_dataset.csv
+# Add the column header of the first file as a comment, so no uncommented header row is written
+first_3d_file=$(ls $OUTPUT_DIR/*_cases_3d.csv 2>/dev/null | head -1)
+if [ -f "$first_3d_file" ]; then
+    # Get the CSV header line and prefix with #
+    header_line=$(grep "^NDim," "$first_3d_file" | head -1)
+    if [ ! -z "$header_line" ]; then
+        echo "# $header_line" >> conv_test_set_3d_dataset.csv
+    fi
+fi
+# Append all data rows (skip comment lines and CSV header) from all files
+for csv_file in $OUTPUT_DIR/*_cases_3d.csv; do
+    if [ -f "$csv_file" ]; then
+        # Drop comment and header lines; `|| true` keeps set -e quiet when grep selects nothing
+        grep -v "^#" "$csv_file" | grep -v "^NDim," >> conv_test_set_3d_dataset.csv 2>/dev/null || true
+    fi
+done
+
+# Count test cases: the combined files keep their header as a '#' comment, so every other line is data
+COUNT_2D=0
+COUNT_3D=0
+if [ -f "conv_test_set_2d_dataset.csv" ]; then
+    COUNT_2D=$(grep -v "^#" conv_test_set_2d_dataset.csv | wc -l)
+fi
+if [ -f "conv_test_set_3d_dataset.csv" ]; then
+    COUNT_3D=$(grep -v "^#" conv_test_set_3d_dataset.csv | wc -l)
+fi
+
+echo ""
+echo "=========================================="
+echo "Dataset Generation Complete!"
+echo "=========================================="
+echo ""
+echo "Generated files:"
+if [ $COUNT_2D -gt 0 ]; then
+    echo "  - conv_test_set_2d_dataset.csv: $COUNT_2D test cases"
+fi
+if [ $COUNT_3D -gt 0 ]; then
+    echo "  - conv_test_set_3d_dataset.csv: $COUNT_3D test cases"
+fi
+echo "  - Intermediate files in: $OUTPUT_DIR/"
+echo ""
+echo "To use these datasets:"
+echo "  1. Build the test: cd ../script && make -j64 test_grouped_convnd_fwd_dataset_xdl"
+echo "  2. Run the test: ./bin/test_grouped_convnd_fwd_dataset_xdl"
+echo ""
\ No newline at end of file
diff --git a/test_data/miopen_to_csv.py b/test_data/miopen_to_csv.py
new file mode 100644
index 0000000000..ae8c187b43
--- /dev/null
+++ b/test_data/miopen_to_csv.py
@@ -0,0 +1,363 @@
+#!/usr/bin/env python3
+"""
+Convert MIOpen Driver Commands to CSV Test Cases
+
+Parses MIOpen driver commands from log files and converts them to CSV format
+for CK convolution testing.
+
+Usage:
+    python3 miopen_to_csv.py --input miopen_commands.txt --output conv_cases.csv
+    python3 miopen_to_csv.py --input miopen_log.txt --output-2d conv_2d.csv --output-3d conv_3d.csv
+"""
+
+import argparse
+import csv
+import re
+import os
+
+def parse_miopen_command(command_line):
+    """
+    Extract the integer convolution flags from one MIOpen driver command
+
+    Example input:
+    ./bin/MIOpenDriver conv -n 4 -c 3 -H 224 -W 224 -k 64 -y 3 -x 3 -p 1 -q 1 -u 1 -v 1 -l 1 -j 1 -m conv -g 1 -F 1 -t 1
+
+    Only flags followed by whitespace and decimal digits are captured; the
+    leftmost occurrence of each flag is used and absent flags are omitted.
+
+    Returns dict mapping flag letter to int value, or None if the line is not
+    a './bin/MIOpenDriver conv' command or none of the flags matched
+    """
+    if not command_line.strip().startswith('./bin/MIOpenDriver conv'):
+        return None
+
+    # (result key, regex) in lookup order; the depth flags (3D only) accept
+    # both a short and a long spelling
+    flag_regexes = (
+        ('n', r'-n\s+(\d+)'),                        # batch size
+        ('c', r'-c\s+(\d+)'),                        # input channels
+        ('k', r'-k\s+(\d+)'),                        # output channels
+        ('H', r'-H\s+(\d+)'),                        # input height
+        ('W', r'-W\s+(\d+)'),                        # input width
+        ('D', r'(?:-D|--in_d)\s+(\d+)'),             # input depth
+        ('y', r'-y\s+(\d+)'),                        # kernel height
+        ('x', r'-x\s+(\d+)'),                        # kernel width
+        ('z', r'(?:-z|--fil_d)\s+(\d+)'),            # kernel depth
+        ('u', r'-u\s+(\d+)'),                        # stride height
+        ('v', r'-v\s+(\d+)'),                        # stride width
+        ('w', r'(?:-w|--conv_stride_d)\s+(\d+)'),    # stride depth
+        ('p', r'-p\s+(\d+)'),                        # pad height
+        ('q', r'-q\s+(\d+)'),                        # pad width
+        ('s', r'(?:-s|--pad_d)\s+(\d+)'),            # pad depth
+        ('l', r'-l\s+(\d+)'),                        # dilation height
+        ('j', r'-j\s+(\d+)'),                        # dilation width
+        ('r', r'(?:-r|--dilation_d)\s+(\d+)'),       # dilation depth
+        ('g', r'-g\s+(\d+)'),                        # groups
+        ('F', r'-F\s+(\d+)'),                        # direction (1=fwd, 2=bwd_weight, 4=bwd_data)
+    )
+
+    hits = ((key, re.search(regex, command_line)) for key, regex in flag_regexes)
+    params = {key: int(hit.group(1)) for key, hit in hits if hit}
+
+    # An empty dict means nothing usable was found on the line
+    return params or None
+
+def miopen_to_conv_param(miopen_params):
+    """
+    Convert MIOpen parameters to CK ConvParam format
+
+    MIOpen channel counts are totals across all groups; the returned row holds
+    channels per group. Any depth flag (D, z, w, r, s) makes the case 3D.
+
+    Returns dictionary in CSV format, or None if the input is empty or invalid
+    (groups or a stride < 1, channels not divisible by groups, output size < 1)
+    """
+    if not miopen_params:
+        return None
+
+    # Determine if 2D or 3D convolution
+    is_3d = any(flag in miopen_params for flag in ('D', 'z', 'w', 'r', 's'))
+
+    # Extract basic parameters with defaults
+    ndim = 3 if is_3d else 2
+    groups = miopen_params.get('g', 1)
+    batch_size = miopen_params.get('n', 1)
+    # MIOpen uses total channels (C*G), CK uses channels per group
+    out_channels_total = miopen_params.get('k', 64)
+    in_channels_total = miopen_params.get('c', 3)
+    # Reject group counts that would divide by zero or silently truncate
+    if groups < 1 or out_channels_total % groups or in_channels_total % groups:
+        return None
+    out_channels = out_channels_total // groups  # CK format: channels per group
+    in_channels = in_channels_total // groups    # CK format: channels per group
+
+    direction = miopen_params.get('F', 1)  # 1=fwd, 2=bwd_weight, 4=bwd_data
+    direction_name = {1: 'fwd', 2: 'bwd_weight', 4: 'bwd_data'}.get(direction, 'fwd')
+
+    if is_3d:
+        # 3D convolution
+        kernel_d = miopen_params.get('z', 3)
+        kernel_h = miopen_params.get('y', 3)
+        kernel_w = miopen_params.get('x', 3)
+        input_d = miopen_params.get('D', 16)
+        input_h = miopen_params.get('H', 32)
+        input_w = miopen_params.get('W', 32)
+        stride_d = miopen_params.get('w', 1)
+        stride_h = miopen_params.get('u', 1)
+        stride_w = miopen_params.get('v', 1)
+        # A zero stride would divide by zero in the output-size formula
+        if min(stride_d, stride_h, stride_w) < 1:
+            return None
+        dilation_d = miopen_params.get('r', 1)
+        dilation_h = miopen_params.get('l', 1)
+        dilation_w = miopen_params.get('j', 1)
+        pad_d = miopen_params.get('s', 0)
+        pad_h = miopen_params.get('p', 0)
+        pad_w = miopen_params.get('q', 0)
+
+        # Calculate output dimensions
+        output_d = (input_d + 2 * pad_d - dilation_d * (kernel_d - 1) - 1) // stride_d + 1
+        output_h = (input_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) // stride_h + 1
+        output_w = (input_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) // stride_w + 1
+        # Skip invalid configurations
+        if output_d <= 0 or output_h <= 0 or output_w <= 0:
+            return None
+
+        return {
+            'NDim': ndim,
+            'Groups': groups,
+            'BatchSize': batch_size,
+            'OutChannels': out_channels,
+            'InChannels': in_channels,
+            'KernelD': kernel_d, 'KernelH': kernel_h, 'KernelW': kernel_w,
+            'InputD': input_d, 'InputH': input_h, 'InputW': input_w,
+            'OutputD': output_d, 'OutputH': output_h, 'OutputW': output_w,
+            'StrideD': stride_d, 'StrideH': stride_h, 'StrideW': stride_w,
+            'DilationD': dilation_d, 'DilationH': dilation_h, 'DilationW': dilation_w,
+            'LeftPadD': pad_d, 'LeftPadH': pad_h, 'LeftPadW': pad_w,
+            'RightPadD': pad_d, 'RightPadH': pad_h, 'RightPadW': pad_w,
+            'TestName': f'MIOpen_3D_{direction_name}'
+        }
+
+    else:
+        # 2D convolution
+        kernel_h = miopen_params.get('y', 3)
+        kernel_w = miopen_params.get('x', 3)
+        input_h = miopen_params.get('H', 32)
+        input_w = miopen_params.get('W', 32)
+        stride_h = miopen_params.get('u', 1)
+        stride_w = miopen_params.get('v', 1)
+        # A zero stride would divide by zero in the output-size formula
+        if min(stride_h, stride_w) < 1:
+            return None
+        dilation_h = miopen_params.get('l', 1)
+        dilation_w = miopen_params.get('j', 1)
+        pad_h = miopen_params.get('p', 0)
+        pad_w = miopen_params.get('q', 0)
+
+        # Calculate output dimensions
+        output_h = (input_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) // stride_h + 1
+        output_w = (input_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) // stride_w + 1
+        # Skip invalid configurations
+        if output_h <= 0 or output_w <= 0:
+            return None
+
+        return {
+            'NDim': ndim,
+            'Groups': groups,
+            'BatchSize': batch_size,
+            'OutChannels': out_channels,
+            'InChannels': in_channels,
+            'KernelH': kernel_h, 'KernelW': kernel_w,
+            'InputH': input_h, 'InputW': input_w,
+            'OutputH': output_h, 'OutputW': output_w,
+            'StrideH': stride_h, 'StrideW': stride_w,
+            'DilationH': dilation_h, 'DilationW': dilation_w,
+            'LeftPadH': pad_h, 'LeftPadW': pad_w,
+            'RightPadH': pad_h, 'RightPadW': pad_w,
+            'TestName': f'MIOpen_2D_{direction_name}'
+        }
+
+def write_csv_cases(test_cases, output_file, ndim):
+    """Write test cases to a CSV file that starts with '#' comment lines.
+
+    ndim == 2 selects the 2D column set, any other value the 3D one. Keys of a
+    case that are not columns are ignored. An empty test_cases list only prints
+    a message; no file is created.
+    """
+    if not test_cases:
+        print(f"No {ndim}D test cases to write")
+        return
+
+    case_count = len(test_cases)
+    print(f"Writing {case_count} {ndim}D test cases to {output_file}")
+
+    headers_2d = ['NDim', 'Groups', 'BatchSize', 'OutChannels', 'InChannels',
+                  'KernelH', 'KernelW', 'InputH', 'InputW', 'OutputH', 'OutputW',
+                  'StrideH', 'StrideW', 'DilationH', 'DilationW',
+                  'LeftPadH', 'LeftPadW', 'RightPadH', 'RightPadW', 'TestName']
+    headers_3d = ['NDim', 'Groups', 'BatchSize', 'OutChannels', 'InChannels',
+                  'KernelD', 'KernelH', 'KernelW', 'InputD', 'InputH', 'InputW',
+                  'OutputD', 'OutputH', 'OutputW', 'StrideD', 'StrideH', 'StrideW',
+                  'DilationD', 'DilationH', 'DilationW',
+                  'LeftPadD', 'LeftPadH', 'LeftPadW', 'RightPadD', 'RightPadH', 'RightPadW', 'TestName']
+    fieldnames = headers_2d if ndim == 2 else headers_3d
+
+    with open(output_file, 'w', newline='') as out:
+        # Two comment lines precede the CSV header row
+        out.write(f"# {ndim}D Convolution Test Cases from MIOpen Commands\n")
+        out.write(f"# Generated {case_count} test cases\n")
+        # extrasaction='ignore' drops keys that are not columns
+        writer = csv.DictWriter(out, fieldnames=fieldnames, extrasaction='ignore')
+        writer.writeheader()
+        writer.writerows(test_cases)
+
+def main():
+    """
+    Command-line entry point: read MIOpen driver commands and write CSV test cases
+
+    Lines without a 'MIOpenDriver conv' command are skipped; each remaining
+    command is parsed, converted and sorted into the 2D or 3D list. With
+    --output all cases go to one file, otherwise 2D and 3D cases are written
+    to --output-2d / --output-3d.
+
+    With --filter-duplicates, a case is dropped when all of its parameters
+    match an earlier case of the same dimensionality.
+
+    Returns 0 on success, 1 if the input file does not exist
+    """
+    parser = argparse.ArgumentParser(description='Convert MIOpen commands to CSV test cases')
+
+    parser.add_argument('--input', type=str, required=True,
+                       help='Input file with MIOpen driver commands')
+    parser.add_argument('--output', type=str,
+                       help='Output CSV file (for mixed 2D/3D cases)')
+    parser.add_argument('--output-2d', type=str, default='miopen_conv_2d.csv',
+                       help='Output CSV file for 2D cases')
+    parser.add_argument('--output-3d', type=str, default='miopen_conv_3d.csv',
+                       help='Output CSV file for 3D cases')
+    parser.add_argument('--filter-duplicates', action='store_true',
+                       help='Remove duplicate test cases')
+    parser.add_argument('--model-name', type=str, default='MIOpen',
+                       help='Model name to use in test case names (default: MIOpen)')
+
+    args = parser.parse_args()
+
+    # Fail early with a non-zero status when there is nothing to read
+    if not os.path.exists(args.input):
+        print(f"ERROR: Input file not found: {args.input}")
+        return 1
+
+    print(f"Parsing MIOpen commands from {args.input}...")
+
+    # Converted cases per dimensionality, plus counters for the summary
+    test_cases_2d = []
+    test_cases_3d = []
+    total_lines = 0
+    parsed_lines = 0
+
+    with open(args.input, 'r') as f:
+        for line_num, line in enumerate(f, 1):
+            total_lines += 1
+            line = line.strip()
+
+            # Commands may be bare or embedded in a log line (logger prefix,
+            # different driver path). Keep everything from 'MIOpenDriver conv'
+            # onwards and restore the './bin/' prefix parse_miopen_command()
+            # requires; lines without a command (including empty ones) are skipped.
+            driver_start = line.find('MIOpenDriver conv')
+            if driver_start == -1:
+                continue
+            line = './bin/' + line[driver_start:]
+
+            try:
+                # Parse MIOpen command
+                miopen_params = parse_miopen_command(line)
+                if not miopen_params:
+                    continue
+
+                # Convert to ConvParam format
+                conv_param = miopen_to_conv_param(miopen_params)
+                if not conv_param:
+                    continue
+
+                # Add model name to test name
+                # NOTE(review): this replaces the direction suffix chosen by
+                # miopen_to_conv_param() with a fixed "_fwd" - confirm intended
+                conv_param['TestName'] = f"{args.model_name}_{conv_param['NDim']}D_fwd"
+
+                # Separate 2D and 3D cases
+                if conv_param['NDim'] == 2:
+                    test_cases_2d.append(conv_param)
+                else:
+                    test_cases_3d.append(conv_param)
+
+                parsed_lines += 1
+
+            except Exception as e:
+                # Best effort: one malformed command must not abort the whole log
+                print(f"WARNING: Failed to parse line {line_num}: {e}")
+                continue
+
+    print(f"Processed {total_lines} lines, parsed {parsed_lines} commands")
+    print(f"Found {len(test_cases_2d)} 2D cases, {len(test_cases_3d)} 3D cases")
+
+    # Remove duplicates if requested
+    if args.filter_duplicates:
+        def make_key(case):
+            # Every column except the label takes part in the key, so cases
+            # that differ only in dilation or padding stay distinct
+            return tuple((name, value) for name, value in case.items()
+                         if name != 'TestName')
+
+        def drop_duplicates(cases):
+            # Keep the first occurrence of each key, preserving input order
+            seen = set()
+            unique = []
+            for case in cases:
+                key = make_key(case)
+                if key not in seen:
+                    seen.add(key)
+                    unique.append(case)
+            return unique
+
+        test_cases_2d = drop_duplicates(test_cases_2d)
+        test_cases_3d = drop_duplicates(test_cases_3d)
+        print(f"After deduplication: {len(test_cases_2d)} 2D cases, {len(test_cases_3d)} 3D cases")
+
+    # Write output files
+    if args.output:
+        # Write mixed cases to single file
+        all_cases = test_cases_2d + test_cases_3d
+        if all_cases:
+            print(f"Writing {len(all_cases)} total cases to {args.output}")
+            # 2D column layout, widened with the depth columns only when a 3D
+            # case is present so 3D rows keep their D values; 2D rows leave
+            # those columns empty
+            axes = ('D', 'H', 'W') if test_cases_3d else ('H', 'W')
+            mixed_headers = ['NDim', 'Groups', 'BatchSize', 'OutChannels', 'InChannels']
+            for field in ('Kernel', 'Input', 'Output', 'Stride', 'Dilation',
+                          'LeftPad', 'RightPad'):
+                mixed_headers.extend(field + axis for axis in axes)
+            mixed_headers.append('TestName')
+
+            with open(args.output, 'w', newline='') as csvfile:
+                csvfile.write(f"# Mixed 2D/3D Convolution Test Cases from MIOpen Commands\n")
+                # Columns a row lacks (depth fields of 2D rows) are written empty
+                writer = csv.DictWriter(csvfile, fieldnames=mixed_headers, extrasaction='ignore')
+                writer.writeheader()
+                for case in all_cases:
+                    writer.writerow(case)
+    else:
+        # Write separate files for 2D and 3D
+        if test_cases_2d:
+            write_csv_cases(test_cases_2d, args.output_2d, 2)
+
+        if test_cases_3d:
+            write_csv_cases(test_cases_3d, args.output_3d, 3)
+
+    print("Conversion completed!")
+    return 0
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # exit() is a site-module helper meant for interactive use
diff --git a/test_data/run_model_with_miopen.py b/test_data/run_model_with_miopen.py
new file mode 100644
index 0000000000..83d08c82b7
--- /dev/null
+++ b/test_data/run_model_with_miopen.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""
+PyTorch Model Runner with MIOpen Command Logging using torchvision models
+
+Usage:
+    MIOPEN_ENABLE_LOGGING_CMD=1 python3 run_model_with_miopen.py --model resnet18 2> miopen_commands.txt
+    
+Available 2D models: alexnet, vgg11, vgg16, resnet18, resnet50, mobilenet_v2, etc.
+Available 3D models: r3d_18, mc3_18, r2plus1d_18
+"""
+
+import torch
+import torch.nn as nn
+import torchvision.models as models
+import torchvision.models.video as video_models
+import argparse
+import os
+
+# 2D image models; main() looks each name up as a constructor on torchvision.models
+MODELS_2D = [
+    'alexnet', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn',
+    'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152',
+    'resnext50_32x4d', 'resnext101_32x8d', 'resnext101_64x4d',
+    'wide_resnet50_2', 'wide_resnet101_2',
+    'densenet121', 'densenet161', 'densenet169', 'densenet201',
+    'inception_v3', 'googlenet',
+    'shufflenet_v2_x0_5', 'shufflenet_v2_x1_0', 'shufflenet_v2_x1_5', 'shufflenet_v2_x2_0',
+    'mobilenet_v2', 'mobilenet_v3_large', 'mobilenet_v3_small',
+    'mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mnasnet1_3',
+    'squeezenet1_0', 'squeezenet1_1'
+]
+# 3D video models; main() looks each name up on torchvision.models.video
+MODELS_3D = [
+    'r3d_18', 'mc3_18', 'r2plus1d_18'
+]
+# Every name accepted by the --model option
+ALL_MODELS = MODELS_2D + MODELS_3D
+
+def main():
+    """Run one forward pass of a torchvision model on a random input tensor.
+
+    The model is built with weights=None; model, input shape, device and
+    precision come from the command line. Informational messages are printed
+    unless --quiet is given.
+    """
+    parser = argparse.ArgumentParser(description='PyTorch Model Runner with MIOpen Command Logging')
+    add_arg = parser.add_argument
+
+    # Model selection
+    add_arg('--model', choices=ALL_MODELS, default='resnet18',
+            help='Model to run')
+
+    # Input tensor dimensions
+    add_arg('--batch-size', type=int, default=4,
+            help='Batch size')
+    add_arg('--channels', type=int, default=3,
+            help='Input channels (e.g., 3 for RGB, 1 for grayscale)')
+    add_arg('--height', type=int, default=224,
+            help='Input height')
+    add_arg('--width', type=int, default=224,
+            help='Input width')
+    add_arg('--input-size', type=int,
+            help='Input size (sets both height and width to same value)')
+    add_arg('--temporal-size', type=int, default=16,
+            help='Temporal dimension for 3D models')
+
+    # Device and precision
+    add_arg('--device', choices=['cuda', 'cpu', 'auto'], default='auto',
+            help='Device to run on')
+    add_arg('--precision', choices=['fp32', 'fp16', 'bf16'], default='fp32',
+            help='Floating point precision')
+
+    # Output control (--verbose is accepted but not consulted below)
+    add_arg('--quiet', action='store_true',
+            help='Suppress output except errors')
+    add_arg('--verbose', action='store_true',
+            help='Verbose output')
+
+    args = parser.parse_args()
+    chatty = not args.quiet
+
+    # A truthy --input-size overrides both spatial dimensions
+    if args.input_size:
+        args.height = args.width = args.input_size
+
+    # Check MIOpen logging
+    if chatty and not os.environ.get('MIOPEN_ENABLE_LOGGING_CMD'):
+        print("WARNING: Set MIOPEN_ENABLE_LOGGING_CMD=1 to capture commands")
+
+    # Device selection: 'auto' resolves to CUDA when available, else CPU
+    device_name = args.device
+    if device_name == 'auto':
+        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
+    device = torch.device(device_name)
+    if chatty:
+        print(f"Using device: {device}")
+
+    # Create model using torchvision (model first, then the random input)
+    if args.model not in MODELS_3D:
+        # 2D image models take (batch, channels, height, width)
+        model = getattr(models, args.model)(weights=None)
+        input_shape = (args.batch_size, args.channels, args.height, args.width)
+        input_tensor = torch.randn(*input_shape)
+        if chatty:
+            print(f"2D model: {args.model}")
+            print(f"Input shape: {input_tensor.shape} (B, C, H, W)")
+    else:
+        # 3D video models take (batch, channels, temporal, height, width)
+        model = getattr(video_models, args.model)(weights=None)
+        input_shape = (args.batch_size, args.channels, args.temporal_size, args.height, args.width)
+        input_tensor = torch.randn(*input_shape)
+        if chatty:
+            print(f"3D model: {args.model}")
+            print(f"Input shape: {input_tensor.shape} (B, C, T, H, W)")
+
+    # Set precision: 'fp32' leaves model and input untouched
+    if args.precision != 'fp32':
+        cast = 'half' if args.precision == 'fp16' else 'bfloat16'
+        model = getattr(model, cast)()
+        input_tensor = getattr(input_tensor, cast)()
+
+    model = model.to(device)
+    input_tensor = input_tensor.to(device)
+    if chatty:
+        print(f"Running {args.model} model...")
+
+    # Run inference
+    model.eval()
+    with torch.no_grad():
+        output = model(input_tensor)
+    if chatty:
+        print(f"Output shape: {output.shape}")
+        print("Done! MIOpen commands logged to stderr")
+
+if __name__ == "__main__":
+    main()
diff --git a/tile_engine/ops/CMakeLists.txt b/tile_engine/ops/CMakeLists.txt
index 0cf2c16da2..7d7002af1b 100644
--- a/tile_engine/ops/CMakeLists.txt
+++ b/tile_engine/ops/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(gemm)
+add_subdirectory(gemm_multi_d)
\ No newline at end of file
diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt
index fe9b7802a7..42c114b499 100644
--- a/tile_engine/ops/gemm/CMakeLists.txt
+++ b/tile_engine/ops/gemm/CMakeLists.txt
@@ -3,6 +3,24 @@ set(GEMM_DATATYPE "fp8;fp16" CACHE STRING "List of datatypes for GEMM (semicolon
 set(GEMM_LAYOUT "rcr" CACHE STRING "List of layout for GEMM (semicolon-separated)")
 
 function(build_gemm_for_datatype datatype layout)
+    # Filter GPU targets to only gfx90a, gfx942, and gfx950
+    set(GEMM_GPU_TARGETS "")
+    set(DESIRED_TARGETS "gfx90a;gfx942;gfx950")
+    
+    foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
+        if(target IN_LIST DESIRED_TARGETS)
+            list(APPEND GEMM_GPU_TARGETS ${target})
+        endif()
+    endforeach()
+    
+    # Skip compilation if no matching targets found
+    if(NOT GEMM_GPU_TARGETS)
+        message(WARNING "Skipping Tile Engine GEMM compilation: No supported GPU targets (gfx90a, gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+        return()
+    endif()
+    
+    message(STATUS "Building GEMM for GPU targets: ${GEMM_GPU_TARGETS}")
+    
     set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}")
 
     # Comment this if-else block when using user_provided_config
@@ -83,6 +101,7 @@ function(build_gemm_for_datatype datatype layout)
             if(chunk_files)
                 set(sub_intermediate_lib_name "gemm_objlib_${name}_${i}_${datatype}_${layout}")
                 add_library(${sub_intermediate_lib_name} OBJECT ${chunk_files})
+                set_property(TARGET ${sub_intermediate_lib_name} PROPERTY HIP_ARCHITECTURES ${GEMM_GPU_TARGETS})
                 list(APPEND sub_intermediate_libs ${sub_intermediate_lib_name})
             endif()
 
@@ -102,6 +121,7 @@ function(build_gemm_for_datatype datatype layout)
             
             add_library(${intermediate_lib_name} STATIC ${obj_exprs})
             add_dependencies(${intermediate_lib_name} gemm_gen_${datatype}_${layout})
+            set_property(TARGET ${intermediate_lib_name} PROPERTY HIP_ARCHITECTURES ${GEMM_GPU_TARGETS})
             #foreach(objlib IN LISTS sub_intermediate_libs)
             #    target_sources(${intermediate_lib_name} PRIVATE $<TARGET_OBJECTS:${objlib}>)
             #endforeach()
@@ -132,6 +152,7 @@ function(build_gemm_for_datatype datatype layout)
     # Executable per datatype
     set(exec_name "benchmark_gemm_${datatype}_${layout}")
     add_executable(${exec_name} benchmark_gemm.cpp)
+    set_property(TARGET ${exec_name} PROPERTY HIP_ARCHITECTURES ${GEMM_GPU_TARGETS})
     target_link_libraries(${exec_name} PRIVATE gemm_host_api_${datatype}_${layout})
     target_compile_options(${exec_name} PRIVATE
         -Wno-undefined-func-template
diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md
index a16b74d297..79152a1a0d 100644
--- a/tile_engine/ops/gemm/README.md
+++ b/tile_engine/ops/gemm/README.md
@@ -20,7 +20,7 @@ mkdir build && cd build
 # replace [Arch] with the appropriate architecture or leave blank and 
 # replace [Datatype1;Datatype2;...] in comma separated datatypes string (possible datatypes are [fp8, bf8, int8, fp16, bf16])
 # replace [Layout1;Layout2;...] in comma separated datatypes string (possible layouts are [rcr, rrr, crr, ccr])
-sh ../script/cmake-ck-dev.sh  ../ [Arch] -DGEMM_DATATYPE="[Datatype1;Datatype2]" -DGEMM_LAYOUT="[Layout1;Layout2]"
+../script/cmake-ck-dev.sh  ../ [Arch] -DGEMM_DATATYPE="[Datatype1;Datatype2]" -DGEMM_LAYOUT="[Layout1;Layout2]"
 # generate different executable for each passed datatype
 make benchmark_gemm_[Datatype1]_[Layout1] -j
 make benchmark_gemm_[Datatype1]_[Layout2] -j
@@ -38,7 +38,7 @@ rm -rf tile_engine/ && make benchmark_gemm_[Datatypes]_[Layout] -j  # rebuild
 ## For eaxmple build for gfx942 for fp8 and fp16 datatypes with rcr layout
 ``` bash
 mkdir build && cd build
-sh ../script/cmake-ck-dev.sh  ../ gfx942 -DGEMM_DATATYPE="fp8;fp16" -DGEMM_LAYOUT="rcr" 
+../script/cmake-ck-dev.sh  ../ gfx942 -DGEMM_DATATYPE="fp8;fp16" -DGEMM_LAYOUT="rcr" 
 make benchmark_gemm_fp8_rcr -j
 make benchmark_gemm_fp16_rcr -j
 ```
diff --git a/tile_engine/ops/gemm/codegen_utils.py b/tile_engine/ops/gemm/codegen_utils.py
index 4a990f3309..dd9de36865 100644
--- a/tile_engine/ops/gemm/codegen_utils.py
+++ b/tile_engine/ops/gemm/codegen_utils.py
@@ -54,7 +54,6 @@ CSHUFFLE_EPILOGUE = """
                                                              ck_tile::tuple<>,
                                                              CLayout,
                                                              ck_tile::element_wise::PassThrough,
-                                                             GemmPipelineProblem::kBlockSize,
                                                              TilePartitioner::MPerBlock,
                                                              TilePartitioner::NPerBlock,
                                                              WarpM,
diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py
index 6d713bdcb8..7def4e2691 100755
--- a/tile_engine/ops/gemm/gemm_instance_builder.py
+++ b/tile_engine/ops/gemm/gemm_instance_builder.py
@@ -297,7 +297,7 @@ struct GemmKernel {{
                 throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!");
             }}
 
-            constexpr dim3 blocks = Kernel::BlockSize();
+            const dim3 blocks = Kernel::BlockSize();
             const dim3 grids = {'Kernel::MaxOccupancyGridSize(stream)' if persistent == 'true' else 'Kernel::GridSize(args.M, args.N, args.k_batch)'};
 
             if(stream.log_level_ > 0)
@@ -346,12 +346,12 @@ struct GemmKernel {{
                 ave_time = ck_tile::launch_kernel_time_mask(
                     stream,
                     run_flush_cache,
-                    ck_tile::make_kernel<blocks.x, kBlockPerCu>(
+                    ck_tile::make_kernel<kBlockPerCu>(
                         Kernel{{}}, grids, blocks, 0, kargs));
             }}
             else{{
                 ave_time = ck_tile::launch_kernel(stream,
-                                          ck_tile::make_kernel<blocks.x, kBlockPerCu>(
+                                          ck_tile::make_kernel<kBlockPerCu>(
                                               Kernel{{}}, grids, blocks, 0, kargs));
             }}
             return ave_time;
diff --git a/tile_engine/ops/gemm_multi_d/CMakeLists.txt b/tile_engine/ops/gemm_multi_d/CMakeLists.txt
new file mode 100644
index 0000000000..dc08e9cad3
--- /dev/null
+++ b/tile_engine/ops/gemm_multi_d/CMakeLists.txt
@@ -0,0 +1,173 @@
+
+set(GEMM_MULTI_D_DATATYPE "fp16" CACHE STRING "List of datatypes for GEMM Multi D (semicolon-separated)")
+set(GEMM_MULTI_D_LAYOUT "rcrr" CACHE STRING "List of layout for GEMM Multi D(semicolon-separated)")
+set(GEMM_MULTI_D_ELEMENTWISE_FUNCTION "mul"  CACHE STRING "Elementwise function")
+
+function(build_gemm_multi_d_for_datatype_layout datatype layout)
+    # Filter GPU targets to only gfx90a, gfx942, and gfx950
+    set(GEMM_GPU_TARGETS "")
+    set(DESIRED_TARGETS "gfx90a;gfx942;gfx950")
+    
+    foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
+        if(target IN_LIST DESIRED_TARGETS)
+            list(APPEND GEMM_GPU_TARGETS ${target})
+        endif()
+    endforeach()
+    
+    # Skip compilation if no matching targets found
+    if(NOT GEMM_GPU_TARGETS)
+        message(WARNING "Skipping Tile Engine GEMM Multi D compilation: No supported GPU targets (gfx90a, gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+        return()
+    endif()
+    
+    message(STATUS "Building GEMM Multi D for GPU targets: ${GEMM_GPU_TARGETS}")
+    
+    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}")
+
+    # Comment this if-else block when using user_provided_config
+    if(layout STREQUAL "rcrr")
+        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
+    else()
+        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/custom_ci_config.json")
+    endif()
+
+    # uncomment this if you want to use user_provided_config.json
+    # set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/user_provided_config.json")
+    
+    # Generate kernel list
+    execute_process(
+        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_multi_d_instance_builder.py
+                --working_path ${working_path}
+                --datatype ${datatype}
+                --layout ${layout}
+                --elementwise_function ${GEMM_MULTI_D_ELEMENTWISE_FUNCTION}
+                --config_json ${json_blob}
+                --list_blobs
+        RESULT_VARIABLE ret
+    )
+    if(NOT ret EQUAL 0)
+        message(FATAL_ERROR "Failed to list kernels for ${datatype} ${layout}: ${ret}")
+    endif()
+
+    file(STRINGS "${working_path}/gemm_multi_d_instance_blobs.txt" codegen_blobs)
+    file(STRINGS "${working_path}/gemm_multi_d_instance_blobs_range.txt" codegen_blobs_range)
+    
+    # Generate the blobs
+    add_custom_command(
+        OUTPUT ${codegen_blobs}
+        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_multi_d_instance_builder.py
+                --working_path "${working_path}"
+                --datatype ${datatype}
+                --layout ${layout}
+                --elementwise_function ${GEMM_MULTI_D_ELEMENTWISE_FUNCTION}
+                --config_json "${json_blob}"
+                --gen_blobs
+        COMMENT "Generating GEMM Multi D instance sources for ${datatype} ${layout}"
+    )
+    add_custom_target(gemm_multi_d_gen_${datatype}_${layout} DEPENDS ${codegen_blobs})
+
+    set(intermediate_libs)
+    list(LENGTH codegen_blobs codegen_blobs_len)
+
+    foreach(blob IN LISTS codegen_blobs_range)
+        string(STRIP "${blob}" stripped_blob)
+        separate_arguments(spilit_blob UNIX_COMMAND "${stripped_blob}")
+        # Each line is: <trait_name> <first_index_inclusive> <last_index_exclusive>   
+        list(GET spilit_blob 0 name)
+        list(GET spilit_blob 1 first)
+        list(GET spilit_blob 2 last)
+        math(EXPR total_files "${last} - ${first}")
+        if(total_files EQUAL 0)
+            continue()        # nothing for this trait
+        endif()
+
+        # Object libraries (chunked) per trait
+        set(sub_intermediate_libs)
+        set(chunk_size 3)
+        math(EXPR num_chunks "( ${total_files} + ${chunk_size} - 1 ) / ${chunk_size}")
+        math(EXPR num_chunks_minus_1 "${num_chunks} - 1")
+        
+        foreach(i RANGE 0 ${num_chunks_minus_1})
+            math(EXPR start "${first} + ${i} * ${chunk_size} ")
+            math(EXPR end "${start} + ${chunk_size} - 1")
+
+            set(chunk_files)
+            foreach(j RANGE ${start} ${end})
+                if(j LESS ${last} AND j LESS ${codegen_blobs_len})
+                    list(GET codegen_blobs ${j} f)
+                    list(APPEND chunk_files "${f}")
+                endif()
+            endforeach()
+
+            #list(LENGTH chunk_files chunk_files_len)
+            #if(chunk_files_len AND chunk_files_len GREATER 1)
+            if(chunk_files)
+                set(sub_intermediate_lib_name "gemm_multi_d_objlib_${name}_${i}_${datatype}_${layout}")
+                add_library(${sub_intermediate_lib_name} OBJECT ${chunk_files})
+                set_property(TARGET ${sub_intermediate_lib_name} PROPERTY HIP_ARCHITECTURES ${GEMM_GPU_TARGETS})
+                list(APPEND sub_intermediate_libs ${sub_intermediate_lib_name})
+            endif()
+
+        endforeach()
+
+        # ------------------ Bundle the object libs into one static lib ---------
+        #list(LENGTH sub_intermediate_libs sub_intermediate_libs_len)
+        #if(sub_intermediate_libs AND sub_intermediate_libs_len GREATER 1)
+        if(sub_intermediate_libs)
+            set(intermediate_lib_name "gemm_multi_d_staticlib_${name}_${datatype}_${layout}")
+            # Collect the $<TARGET_OBJECTS:...> expressions
+            
+            set(obj_exprs)
+            foreach(objlib IN LISTS sub_intermediate_libs)
+                list(APPEND obj_exprs $<TARGET_OBJECTS:${objlib}>)
+            endforeach()
+            
+            add_library(${intermediate_lib_name} STATIC ${obj_exprs})
+            add_dependencies(${intermediate_lib_name} gemm_multi_d_gen_${datatype}_${layout})
+            set_property(TARGET ${intermediate_lib_name} PROPERTY HIP_ARCHITECTURES ${GEMM_GPU_TARGETS})
+            #foreach(objlib IN LISTS sub_intermediate_libs)
+            #    target_sources(${intermediate_lib_name} PRIVATE $<TARGET_OBJECTS:${objlib}>)
+            #endforeach()
+            list(APPEND intermediate_libs ${intermediate_lib_name})
+        endif()
+
+    endforeach()
+    
+    # Interface library for instances
+    add_library(gemm_multi_d_template_instances_${datatype}_${layout} INTERFACE)
+    add_dependencies(gemm_multi_d_template_instances_${datatype}_${layout} gemm_multi_d_gen_${datatype}_${layout})
+    target_link_libraries(gemm_multi_d_template_instances_${datatype}_${layout} INTERFACE ${intermediate_libs})
+    target_include_directories(gemm_multi_d_template_instances_${datatype}_${layout} INTERFACE
+        ${CMAKE_CURRENT_LIST_DIR}
+        "${working_path}"
+    )
+    set_target_properties(gemm_multi_d_template_instances_${datatype}_${layout} PROPERTIES LINKER_LANGUAGE CXX)
+    
+    # Host API interface library
+    add_library(gemm_multi_d_host_api_${datatype}_${layout} INTERFACE)
+    target_link_libraries(gemm_multi_d_host_api_${datatype}_${layout} INTERFACE gemm_multi_d_template_instances_${datatype}_${layout})
+    target_include_directories(gemm_multi_d_host_api_${datatype}_${layout} INTERFACE
+        ${CMAKE_CURRENT_LIST_DIR}
+        "${working_path}"
+    )
+
+    
+
+    # Executable per datatype
+    set(exec_name "benchmark_gemm_multi_d_${datatype}_${layout}")
+    add_executable(${exec_name} benchmark_gemm_multi_d.cpp)
+    set_property(TARGET ${exec_name} PROPERTY HIP_ARCHITECTURES ${GEMM_GPU_TARGETS})
+    target_link_libraries(${exec_name} PRIVATE gemm_multi_d_host_api_${datatype}_${layout})
+    target_compile_options(${exec_name} PRIVATE
+        -Wno-undefined-func-template
+        -Wno-float-equal
+        --offload-compress
+    )
+endfunction()
+
+# Build the instance libraries and the benchmark executable for every datatype/layout pair
+foreach(dt IN LISTS GEMM_MULTI_D_DATATYPE)
+    foreach(l IN LISTS GEMM_MULTI_D_LAYOUT)
+        build_gemm_multi_d_for_datatype_layout(${dt} ${l})
+    endforeach()
+endforeach()
diff --git a/tile_engine/ops/gemm_multi_d/README.md b/tile_engine/ops/gemm_multi_d/README.md
new file mode 100644
index 0000000000..66f0ed80af
--- /dev/null
+++ b/tile_engine/ops/gemm_multi_d/README.md
@@ -0,0 +1,110 @@
+
+CK Tile Engine for GEMM Multi D generates and runs GEMM Multi D kernels for different combinations of BlockTile sizes, WarpTile sizes, and WarpTile mappings across all valid pipelines, schedulers, and epilogues, while allowing custom datatype and layout selections.
+
+# Kernel Configurations
+
+# User Specific
+Users can specify custom kernel configurations such as tile size, warp size, padding, pipeline, scheduler, and epilogue in the config file. This allows building only for selected configurations, significantly reducing build time.
+For reference please see `./configs/user_provided_config.json`.
+
+# Default
+The Tile Engine also has a default kernel configuration that provides a range of configuration parameter values, which helps users who lack kernel development experience to run benchmarks. For reference, please see `./configs/default_config.json`.
+
+If the user does not provide a kernel configuration, the Tile Engine uses the default kernel configuration to generate and benchmark kernel instances.
+
+## Build Instructions
+``` bash
+# in the root of composable kernel create build directory
+mkdir build && cd build
+# build composable kernel
+# replace [Arch] with the appropriate architecture, or leave it blank
+# replace [Datatype] with a comma-separated string of datatypes (possible datatypes are [fp16])
+# replace [Layout1;Layout2;...] with a semicolon-separated string of layouts (possible layouts are [rcr, rrr, crr, ccr])
+# replace "mul" with one of mul, add, or passthrough to select Multiply, Add, or Passthrough as the elementwise function. If not specified, mul is used by default.
+../script/cmake-ck-dev.sh  ../ [Arch] -DGEMM_MULTI_D_DATATYPE="[Datatype]" -DGEMM_MULTI_D_LAYOUT="[Layout1;Layout2]" -DGEMM_MULTI_D_ELEMENTWISE_FUNCTION="mul"
+# build a separate executable for each datatype/layout pair
+make benchmark_gemm_multi_d_[Datatype]_[Layout1] -j
+make benchmark_gemm_multi_d_[Datatype]_[Layout2] -j
+```
+`benchmark_gemm_multi_d_[Datatype]_[Layout]` will be located in the `./bin/` directory.
+
+`benchmark_gemm_multi_d_[Datatype]_[Layout]` must be rebuilt every time the configuration file is modified.
+
+``` bash
+rm -rf tile_engine/ && make benchmark_gemm_multi_d_[Datatype]_[Layout] -j  # rebuild
+```
+
+## For example, build for gfx942 with the fp16 datatype and rcrr layout
+``` bash
+mkdir build && cd build
+../script/cmake-ck-dev.sh  ../ gfx942 -DGEMM_MULTI_D_DATATYPE="fp16" -DGEMM_MULTI_D_LAYOUT="rcrr" 
+make benchmark_gemm_multi_d_fp16_rcrr -j
+```
+## benchmark_gemm inputs
+```
+                      -m    The value for m dimension. Default is 3840.
+                      -n    The value for n dimension. Default is 4096.
+                      -k    The value for k dimension. Default is 2048.
+               -stride_a    The stride value for tensor A. Default is 0.
+               -stride_b    The stride value for tensor B. Default is 0.
+              -stride_ds    The stride value for tensor Ds. Default is 0.
+               -stride_e    The stride value for tensor E. Default is 0.
+                -split_k    The split value for k dimension. Default is 1.
+                 -verify    The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 for validation on GPU. Default is 1, validation on CPU, as validation on GPU is not supported.
+                    -log    Whether to output kernel instance information. Possible values are true or false. Default is false.
+                 -warmup    The number of warm-up iterations run before benchmarking the kernel. Default is 50.
+                 -repeat    The number of iterations to benchmark the kernel. Default is 100.
+                  -timer    Whether to use the GPU timer. Possible values are false or true. Default is true.
+                   -init    The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
+            -flush_cache    To flush cache, possible values are true or false. Default is false.
+         -rotating_count    Number of iterations to rotate the cache. Default is 5.
+                 -metric    Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
+           -csv_filename    The filename of benchmark result. Default is gemm_multi_d_kernel.
+               -pipeline    The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.
+              -scheduler    The type of scheduler. Possible values are intrawave. Default is intrawave.
+               -epilogue    The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.
+                  -pad_m    Whether pad or not in m direction. Possible values are true or false. Default is false.
+                  -pad_n    Whether pad or not in n direction. Possible values are true or false. Default is false.
+                  -pad_k    Whether pad or not in k direction. Possible values are true or false. Default is false.
+
+Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in user_provided_config.json 
+```
+Note: In `./configs/user_provided_config.json` pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be from one of the values specified above.
+
+## Example
+
+The following JSON file specifies parameters used to generate and build GEMM kernels across all possible combinations of pipelines, schedulers, epilogues with different tile and warp sizes.
+
+```json
+{     
+    /// other parameters ///
+    
+    "tile_m": {
+      "values": [256]
+    },
+    "tile_n": {
+      "values": [256]
+    },
+    "tile_k": {
+      "values": [64, 32]
+    },
+
+    /// other parameters ///
+
+    "pipeline": {
+      "values": ["compv3", "compv4", "mem"]
+    },
+    "scheduler": {
+      "values": ["intrawave", "interwave"]
+    },
+    "epilogue": {
+      "values": ["cshuffle"]
+    }
+}
+```
+
+At runtime, a specific subset of the generated kernels can be selected using command-line arguments.
+``` bash
+./bin/benchmark_gemm_multi_d_[Datatype]_[Layout] -pipeline=compv3 -scheduler=intrawave -epilogue=cshuffle 
+```
+The above command runs kernels configured with the compv3 pipeline, intrawave scheduler, and cshuffle epilogue, while sweeping over different BlockTile sizes, WarpTile sizes, and WarpTile mappings.
diff --git a/tile_engine/ops/gemm_multi_d/benchmark_gemm_multi_d.cpp b/tile_engine/ops/gemm_multi_d/benchmark_gemm_multi_d.cpp
new file mode 100644
index 0000000000..764a295809
--- /dev/null
+++ b/tile_engine/ops/gemm_multi_d/benchmark_gemm_multi_d.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <functional>
+#include <tuple>
+#include <exception>
+
+#include "benchmark_gemm_multi_d.hpp"
+#include "gemm_multi_d_profiler.hpp"
+
+void benchmark_gemm_multi_d(const ck_tile::ArgParser& arg_parser)
+{
+    GemmMultiDProblem gemm_multi_d_problem{arg_parser.get_int("split_k"),
+                                           arg_parser.get_int("m"),
+                                           arg_parser.get_int("n"),
+                                           arg_parser.get_int("k"),
+                                           arg_parser.get_int("stride_a"),
+                                           arg_parser.get_int("stride_b"),
+                                           arg_parser.get_int("stride_ds"), // stride_d0_
+                                           arg_parser.get_int("stride_ds"), // stride_d1_: D0 and D1 share the single -stride_ds option
+                                           arg_parser.get_int("stride_e"),
+                                           DataTypeTraits<ADataType>::name,
+                                           DataTypeTraits<BDataType>::name,
+                                           DataTypeTraits<D0DataType>::name,
+                                           DataTypeTraits<D1DataType>::name,
+                                           DataTypeTraits<AccDataType>::name,
+                                           DataTypeTraits<EDataType>::name,
+                                           ALayout::name,
+                                           BLayout::name,
+                                           D0Layout::name,
+                                           D1Layout::name,
+                                           ELayout::name};
+
+    Setting setting{arg_parser.get_int("warmup"),
+                    arg_parser.get_int("repeat"),
+                    arg_parser.get_bool("timer"),
+                    arg_parser.get_int("verify"),
+                    arg_parser.get_int("init"),
+                    arg_parser.get_bool("log"),
+                    arg_parser.get_str("csv_filename"),
+                    arg_parser.get_bool("flush_cache"),
+                    arg_parser.get_int("rotating_count")};
+
+    auto& profiler = GemmMultiDProfiler::instance(setting);
+
+    try
+    {
+        auto kernel_func = get_kernel_func_by_trait(arg_parser);
+        profiler.benchmark(gemm_multi_d_problem, kernel_func);
+        profiler.select_best_instance(static_cast<Metric>(arg_parser.get_int("metric")));
+    }
+    catch(const std::exception& e)
+    { // NOTE(review): the failure is only logged here, so main() still returns 0
+        std::cerr << "Benchmark failed: " << e.what() << std::endl;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    try
+    {
+        auto [result, parser] = create_args(argc, argv);
+        if(!result) // NOTE(review): presumably argument parsing failed; confirm in create_args
+            return EXIT_FAILURE;
+        benchmark_gemm_multi_d(parser);
+        return 0; // also reached when benchmark_gemm_multi_d caught and logged an exception
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << "\n";
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tile_engine/ops/gemm_multi_d/benchmark_gemm_multi_d.hpp b/tile_engine/ops/gemm_multi_d/benchmark_gemm_multi_d.hpp
new file mode 100644
index 0000000000..f52d69e374
--- /dev/null
+++ b/tile_engine/ops/gemm_multi_d/benchmark_gemm_multi_d.hpp
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <fstream>
+#include <stdexcept>
+
+#include "gemm_multi_d_host_api.hpp"
+
+struct GemmMultiDProblem
+{ // NOTE: brace-initialised positionally in benchmark_gemm_multi_d.cpp; keep the member order
+    int split_k_; // split value for the k dimension
+    int m_, n_, k_; // GEMM dimensions
+    int stride_a_, stride_b_, stride_d0_, stride_d1_, stride_e_; // per-tensor strides
+
+    std::string dtype_a_, dtype_b_, dtype_d0_, dtype_d1_, dtype_acc_, dtype_e_; // data type names
+    std::string layout_a_, layout_b_, layout_d0_, layout_d1_, layout_e_; // layout names
+
+    friend std::ostream& operator<<(std::ostream& os, const GemmMultiDProblem& problem)
+    { // Writes every member as a "key":value line between braces; strings are quoted.
+        os << "{\n"
+           << "   \"split_k\":" << problem.split_k_ << ",\n"
+           << "   \"m\":" << problem.m_ << ",\n"
+           << "   \"n\":" << problem.n_ << ",\n"
+           << "   \"k\":" << problem.k_ << ",\n"
+           << "   \"stride_a\":" << problem.stride_a_ << ",\n"
+           << "   \"stride_b\":" << problem.stride_b_ << ",\n"
+           << "   \"stride_d0\":" << problem.stride_d0_ << ",\n"
+           << "   \"stride_d1\":" << problem.stride_d1_ << ",\n"
+           << "   \"stride_e\":" << problem.stride_e_ << ",\n"
+           << "   \"dtype_a\":\"" << problem.dtype_a_ << "\",\n"
+           << "   \"dtype_b\":\"" << problem.dtype_b_ << "\",\n"
+           << "   \"dtype_d0\":\"" << problem.dtype_d0_ << "\",\n"
+           << "   \"dtype_d1\":\"" << problem.dtype_d1_ << "\",\n"
+           << "   \"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n"
+           << "   \"dtype_e\":\"" << problem.dtype_e_ << "\",\n"
+           << "   \"layout_a\":\"" << problem.layout_a_ << "\",\n"
+           << "   \"layout_b\":\"" << problem.layout_b_ << "\",\n"
+           << "   \"layout_d0\":\"" << problem.layout_d0_ << "\",\n"
+           << "   \"layout_d1\":\"" << problem.layout_d1_ << "\",\n"
+           << "   \"layout_e\":\"" << problem.layout_e_ << "\"\n"
+           << "}";
+        return os;
+    }
+};
+
+struct Setting
+{ // Benchmark settings; filled positionally from the CLI options named below.
+    int n_warmup_; // -warmup: iterations run before measuring
+    int n_repeat_; // -repeat: iterations that are measured
+    bool is_gpu_timer_; // -timer: whether the GPU timer is used
+    int verify_; // -verify: 0 = no validation, 1 = CPU, 2 = GPU
+    int init_method_; // -init: 0 = random, 1 = linear, 2 = constant(1)
+    bool log_; // -log: whether kernel instance information is printed
+    std::string csv_filename_; // -csv_filename: file name for the benchmark results
+    bool flush_cache_; // -flush_cache
+    int rotating_count_; // -rotating_count: iterations used to rotate the cache
+};
+
+/// @brief Computes the CPU reference result for GEMM Multi D when verification is requested.
+inline void gemm_multi_d_host_reference(int verify,
+                                        ck_tile::HostTensor<ADataType>& a_m_k,
+                                        ck_tile::HostTensor<BDataType>& b_k_n,
+                                        ck_tile::HostTensor<D0DataType>& d0_m_n,
+                                        ck_tile::HostTensor<D1DataType>& d1_m_n,
+                                        ck_tile::HostTensor<EDataType>& e_m_n_host_result)
+{
+    if(verify > 0)
+    {
+        // Only CPU verification is currently supported for GEMM Multi D, so every
+        // positive verify mode runs the host reference; verify <= 0 does nothing.
+        ck_tile::reference_gemm_multiple_d<ADataType,
+                                           BDataType,
+                                           DsDataType,
+                                           AccDataType,
+                                           EDataType,
+                                           ElementWiseFn>(
+            a_m_k, b_k_n, {d0_m_n, d1_m_n}, e_m_n_host_result);
+    }
+}
+
+enum class Metric
+{
+    LATENCY   = 0, // lower is better (see PerformanceResult::compare)
+    TFLOPS    = 1, // higher is better
+    BANDWIDTH = 2 // higher is better
+};
+
+inline constexpr auto get_metric_name(Metric m)
+{ // Returns the lower-case name of the metric as a string literal.
+    switch(m)
+    {
+    case Metric::LATENCY: return "latency";
+    case Metric::TFLOPS: return "tflops";
+    case Metric::BANDWIDTH: return "bandwidth";
+    default: throw std::invalid_argument("Unsupported metric type"); // m is not a named enumerator
+    }
+}
+
+struct PerformanceResult
+{
+    double latency_; // printed as "latency(ms)"
+    double tflops_; // printed as "tflops(TFlops)"
+    double bandwidth_; // printed as "bandwidth(GB/s)"
+
+    static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m)
+    { // Returns true when `a` is strictly better than `b` under metric `m`.
+        switch(m)
+        {
+        case Metric::LATENCY: return a.latency_ < b.latency_; // smaller latency wins
+        case Metric::TFLOPS: return a.tflops_ > b.tflops_; // larger throughput wins
+        case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; // larger bandwidth wins
+        default: throw std::invalid_argument("Unsupported metric type");
+        }
+    }
+
+    friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result)
+    { // std::fixed / std::setprecision(2) stay set on `os` for everything written afterwards.
+        os << "{\n"
+           << "   \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_
+           << ",\n"
+           << "   \"tflops(TFlops)\": " << result.tflops_ << ",\n"
+           << "   \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n"
+           << "}";
+        return os;
+    }
+};
+
+struct KernelInstance
+{
+    std::string name_;
+    GemmMultiDProblem problem_;
+    PerformanceResult perf_result_;
+
+    static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m)
+    { // Orders instances by their performance results only; name and problem are ignored.
+        return PerformanceResult::compare(a.perf_result_, b.perf_result_, m);
+    }
+
+    friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj)
+    { // NOTE(review): name and problem are wrapped in quotes with raw newlines/quotes inside, so the text is not valid JSON.
+        os << "{\n"
+           << " \"name\": \"" << "{\n"
+           << obj.name_ << "\n}" << "\",\n"
+           << " \"problem\": \"" << obj.problem_ << "\",\n"
+           << " \"perf_result\": " << obj.perf_result_ << "\n"
+           << "}";
+        return os;
+    }
+};
+
+inline std::string get_rocm_version()
+{ // Returns the first line of /opt/rocm/.info/version, or "Unknown" if it cannot be opened.
+    std::ifstream version_file("/opt/rocm/.info/version");
+    if(version_file.is_open())
+    {
+        std::string version;
+        std::getline(version_file, version); // only the first line is read
+        return version;
+    }
+    return "Unknown";
+}
+
+/// @brief Computes the (rtol, atol) tuple used to verify a result accumulated over K in kbatch splits.
+inline auto calculate_rtol_atol(const ck_tile::index_t K,
+                                const ck_tile::index_t kbatch,
+                                const float max_accumulated_value)
+{
+    // ComputeType is the narrowest (smallest sizeof) of ADataType, BDataType and D0DataType.
+    using ComputeTypeAB =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+    using ComputeType =
+        std::conditional_t<sizeof(ComputeTypeAB) < sizeof(D0DataType), ComputeTypeAB, D0DataType>;
+
+    // Thresholds for a single split, which accumulates ceil(K / kbatch) products.
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, EDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, EDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+
+    // Thresholds for the error added by accumulating the kbatch partial results.
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<EDataType, EDataType, EDataType>(kbatch);
+
+    const auto atol_split_k = ck_tile::get_absolute_threshold<EDataType, EDataType, EDataType>(
+        max_accumulated_value, kbatch);
+
+    // Return the looser (larger) threshold of each kind as (rtol, atol).
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
+/// @brief Checks the device result against the host reference and prints the thresholds and verdict.
+inline bool compare(const std::string& instanceName,
+                    ck_tile::index_t K,
+                    ck_tile::HostTensor<EDataType>& e_m_n_dev_result,
+                    ck_tile::HostTensor<EDataType>& e_m_n_host_result)
+{
+    const float max_accumulated_value =
+        *std::max_element(e_m_n_host_result.mData.begin(), e_m_n_host_result.mData.end());
+
+    const auto rtol_atol = calculate_rtol_atol(K, 1, max_accumulated_value); // kbatch fixed to 1
+
+    bool pass = ck_tile::check_err(e_m_n_dev_result,
+                                   e_m_n_host_result,
+                                   "Error: Incorrect results!",
+                                   rtol_atol.at(ck_tile::number<0>{}),
+                                   rtol_atol.at(ck_tile::number<1>{}));
+
+    std::cout << "For " << instanceName << " Relative error threshold is "
+              << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is "
+              << rtol_atol.at(ck_tile::number<1>{}) << std::endl;
+    std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl;
+
+    return pass;
+}
diff --git a/tile_engine/ops/gemm_multi_d/configs/custom_ci_config.json b/tile_engine/ops/gemm_multi_d/configs/custom_ci_config.json
new file mode 100644
index 0000000000..cd638d9af0
--- /dev/null
+++ b/tile_engine/ops/gemm_multi_d/configs/custom_ci_config.json
@@ -0,0 +1,80 @@
+{
+  "tile_config": {
+    "tile_m": {
+      "values": [
+        256      ]
+    },
+    "tile_n": {
+      "values": [
+        128
+      ]
+    },
+    "tile_k": {
+      "values": [
+        32
+      ]
+    },
+    "warp_m": {
+      "values": [
+        2
+      ]
+    },
+    "warp_n": {
+      "values": [
+        2
+      ]
+    },
+    "warp_k": {
+      "values": [
+        1
+      ]
+    },
+    "warp_tile_m": {
+      "values": [
+        16
+      ]
+    },
+    "warp_tile_n": {
+      "values": [
+        16
+      ]
+    },
+    "warp_tile_k": {
+      "values": [
+        16
+      ]
+    }
+  },
+  "trait_config": {
+    "pipeline": {
+      "values": [
+        "compv3"
+      ]
+    },
+    "scheduler": {
+      "values": [
+        "intrawave"
+      ]
+    },
+    "epilogue": {
+      "values": [
+        "cshuffle"
+      ]
+    },
+    "pad_m": {
+      "values": [
+        false
+      ]
+    },
+    "pad_n": {
+      "values": [
+        false
+      ]
+    },
+    "pad_k": {
+      "values": [
+        false
+      ]
+    }
+  }
+}
\ No newline at end of file
diff --git a/tile_engine/ops/gemm_multi_d/configs/default_config.json b/tile_engine/ops/gemm_multi_d/configs/default_config.json
new file mode 100644
index 0000000000..6d1afa4425
--- /dev/null
+++ b/tile_engine/ops/gemm_multi_d/configs/default_config.json
@@ -0,0 +1,84 @@
+{
+  "tile_config": {
+    "tile_m": {
+      "values": [
+        256
+      ]
+    },
+    "tile_n": {
+      "values": [
+        128
+      ]
+    },
+    "tile_k": {
+      "values": [
+        32
+      ]
+    },
+    "warp_m": {
+      "values": [
+        2
+      ]
+    },
+    "warp_n": {
+      "values": [
+        2
+      ]
+    },
+    "warp_k": {
+      "values": [
+        1
+      ]
+    },
+    "warp_tile_m": {
+      "values": [
+        16
+      ]
+    },
+    "warp_tile_n": {
+      "values": [
+        16
+      ]
+    },
+    "warp_tile_k": {
+      "values": [
+        16
+      ]
+    }
+  },
+  "trait_config": {
+    "pipeline": {
+      "values": [
+        "compv3",
+        "compv4",
+        "mem"
+      ]
+    },
+    "scheduler": {
+      "values": [
+        "intrawave",
+        "interwave"
+      ]
+    },
+    "epilogue": {
+      "values": [
+        "cshuffle"
+      ]
+    },
+    "pad_m": {
+      "values": [
+        false
+      ]
+    },
+    "pad_n": {
+      "values": [
+        false
+      ]
+    },
+    "pad_k": {
+      "values": [
+        false
+      ]
+    }
+  }
+}
\ No newline at end of file
diff --git a/tile_engine/ops/gemm_multi_d/configs/user_provided_config.json b/tile_engine/ops/gemm_multi_d/configs/user_provided_config.json
new file mode 100644
index 0000000000..243d858fe5
--- /dev/null
+++ b/tile_engine/ops/gemm_multi_d/configs/user_provided_config.json
@@ -0,0 +1,81 @@
+{
+  "tile_config": {
+    "tile_m": {
+      "values": [
+        256
+      ]
+    },
+    "tile_n": {
+      "values": [
+        256
+      ]
+    },
+    "tile_k": {
+      "values": [
+        64
+      ]
+    },
+    "warp_m": {
+      "values": [
+        2
+      ]
+    },
+    "warp_n": {
+      "values": [
+        2
+      ]
+    },
+    "warp_k": {
+      "values": [
+        1
+      ]
+    },
+    "warp_tile_m": {
+      "values": [
+        32
+      ]
+    },
+    "warp_tile_n": {
+      "values": [
+        32
+      ]
+    },
+    "warp_tile_k": {
+      "values": [
+        16
+      ]
+    }
+  },
+  "trait_config": {
+    "pipeline": {
+      "values": [
+        "compv3"
+      ]
+    },
+    "scheduler": {
+      "values": [
+        "intrawave"      
+      ]
+    },
+    "epilogue": {
+      "values": [
+        "cshuffle"      
+      ]
+    },
+    "pad_m": {
+      "values": [
+        false
+      ]
+    },
+    "pad_n": {
+      "values": [
+        false
+      ]
+    },
+    "pad_k": {
+      "values": [
+        false
+      ]
+    }
+  }
+}
\ No newline at end of file
diff --git a/tile_engine/ops/gemm_multi_d/gemm_multi_d_codegen_utils.py b/tile_engine/ops/gemm_multi_d/gemm_multi_d_codegen_utils.py
new file mode 100644
index 0000000000..9aca3407b1
--- /dev/null
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_codegen_utils.py
@@ -0,0 +1,228 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+# -*- coding: utf-8 -*-
+
+"""
+Mappings and utility functions for kernel code generation.
+"""
+
+import subprocess
+import re
+from functools import lru_cache
+
+DATA_TYPE_MAP = {
+    "fp32": "float",
+    "fp16": "ck_tile::half_t",
+    "bf16": "ck_tile::bf16_t",
+    "int8": "ck_tile::int8_t",
+    "fp8": "ck_tile::fp8_t",
+    "bf8": "ck_tile::bf8_t",
+    "int4": "ck_tile::pk_int4_t",
+    "int32": "ck_tile::int32_t",
+}
+
+LAYOUT_MAP = {
+    "r": "ck_tile::tensor_layout::gemm::RowMajor",
+    "c": "ck_tile::tensor_layout::gemm::ColumnMajor",
+}
+
+
+# TODO: the default (2D) epilogue below is not supported for GEMM Multi D yet; kept for reference.
+# DEFAULT_EPILOGUE = """
+#             using GemmEpilogue = ck_tile::DefaultGemm2DEpilogue<
+#                                 ck_tile::DefaultGemm2DEpilogueProblem<ADataType,
+#                                                                       BDataType,
+#                                                                       AccDataType,
+#                                                                       CDataType,
+#                                                                       CLayout,
+#                                                                       kPadM,
+#                                                                       kPadN,
+#                                                                       WarpTileM,
+#                                                                       WarpTileN,
+#                                                                       WarpTileK,
+#                                                                       UniversalGemmProblem::TransposeC,
+#                                                                       true,
+#                                                                       memory_operation>>;
+# """
+
+CSHUFFLE_EPILOGUE = """
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                             BDataType,
+                                                             DsDataType,
+                                                             AccDataType,
+                                                             EDataType,
+                                                             DsLayout,
+                                                             ELayout,
+                                                             CDEElementWise,
+                                                             TilePartitioner::MPerBlock,
+                                                             TilePartitioner::NPerBlock,
+                                                             WarpM,
+                                                             WarpN,
+                                                             WarpTileM,
+                                                             WarpTileN,
+                                                             WarpTileK,
+                                                             UniversalGemmProblem::TransposeC,
+                                                             memory_operation>>;
+"""
+
+PIPELINE_MAP = {
+    "mem": ["ck_tile::BaseGemmPipelineAgBgCrMem", "ck_tile::GemmPipelineAgBgCrMem"],
+    "compv3": [
+        "ck_tile::BaseGemmPipelineAgBgCrCompV3",
+        "ck_tile::GemmPipelineAgBgCrCompV3",
+    ],
+    "compv4": [
+        "ck_tile::BaseGemmPipelineAgBgCrCompV4",
+        "ck_tile::GemmPipelineAgBgCrCompV4",
+    ],
+}
+
+SCHEDULER_MAP = {
+    "interwave": "ck_tile::GemmPipelineScheduler::Interwave",
+    "intrawave": "ck_tile::GemmPipelineScheduler::Intrawave",
+}
+
+# EPILOGUE_MAP = {"default": DEFAULT_EPILOGUE, "cshuffle": CSHUFFLE_EPILOGUE}
+
+EPILOGUE_MAP = {"cshuffle": CSHUFFLE_EPILOGUE}
+
+
+def BOOL_MAP(b_):  # Map the truthiness of b_ to the string "true" or "false".
+    return {True: "true", False: "false"}[bool(b_)]
+
+
+# Supported warp-tile size triples (presumably [M, N, K]) per GPU architecture and datatype combination; more can be added.
+warp_tile_supported_combinations = {
+    "gfx90a": {
+        "fp16_fp16_fp16": [
+            [32, 32, 8],
+            [16, 16, 16],
+            [32, 32, 16],
+            [16, 16, 32],
+            [4, 64, 16],
+            [64, 4, 16],
+        ],
+        "bf16_bf16_bf16": [
+            [32, 32, 8],
+            [16, 16, 16],
+            [32, 32, 16],
+            [16, 16, 32],
+            [4, 64, 16],
+            [64, 4, 16],
+        ],
+        "fp8_fp8_fp16": [[32, 32, 16], [32, 32, 32]],
+        "bf8_bf8_fp16": [[32, 32, 16], [32, 32, 32]],
+    },
+    "gfx942": {
+        "fp16_fp16_fp16": [
+            [32, 32, 8],
+            [16, 16, 16],
+            [32, 32, 16],
+            [16, 16, 32],
+            [4, 64, 16],
+            [64, 4, 16],
+        ],
+        "bf16_bf16_bf16": [
+            [32, 32, 8],
+            [16, 16, 16],
+            [32, 32, 16],
+            [16, 16, 32],
+            [4, 64, 16],
+            [64, 4, 16],
+        ],
+        "fp8_fp8_fp16": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
+        "bf8_bf8_fp16": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32]],
+        "int8_int8_int32": [[16, 16, 32], [32, 32, 16]],
+    },
+    "gfx950": {
+        "fp16_fp16_fp16": [
+            [32, 32, 8],
+            [16, 16, 16],
+            [32, 32, 16],
+            [16, 16, 32],
+            [4, 64, 16],
+            [64, 4, 16],
+        ],
+        "bf16_bf16_bf16": [
+            [32, 32, 8],
+            [16, 16, 16],
+            [32, 32, 16],
+            [16, 16, 32],
+            [4, 64, 16],
+            [64, 4, 16],
+        ],
+        "fp8_fp8_fp16": [
+            [32, 32, 16],
+            [32, 32, 32],
+            [16, 16, 32],
+            [16, 16, 64],
+            [16, 16, 128],
+            [32, 32, 64],
+        ],
+        "bf8_bf8_fp16": [
+            [32, 32, 16],
+            [32, 32, 32],
+            [16, 16, 64],
+            [16, 16, 32],
+            [16, 16, 128],
+            [32, 32, 64],
+        ],
+    },
+}
+
+# Trait combinations, as (pipeline, epilogue, scheduler) tuples, that are not supported.
+trait_unsupported_combinations = {
+    ("compv3", "cshuffle", "interwave"),
+    ("compv3", "default", "interwave"),
+    ("compv4", "cshuffle", "interwave"),
+    ("compv4", "default", "interwave"),
+}
+
+
+ELEMENT_SIZE_MAP = {
+    "fp16": 2,
+    "bf16": 2,
+    "int8": 1,
+    "fp8": 1,
+    "bf8": 1,
+    "int4": 0.5,
+    "int32": 4,
+}
+
+
+def element_size(data_type: str) -> float:
+    """Calculate the size (in bytes) of a single element for given data type."""
+    data_type = data_type.lower()
+    if data_type not in ELEMENT_SIZE_MAP:
+        raise ValueError(f"Unsupported data type: {data_type}")
+    return ELEMENT_SIZE_MAP[data_type]
+
+
+GPU_NAME_PATTERN = re.compile(r"Name:\s*(gfx\d+\w*)")
+
+
+@lru_cache(maxsize=1)
+def get_gpu_name_by_id(gpu_id: int = 0) -> str:
+    """Retrieve GPU name (e.g. gfx90a) by device ID"""
+    try:
+        output = subprocess.check_output(
+            ["rocminfo"], text=True, stderr=subprocess.PIPE, timeout=5
+        )
+        if matches := GPU_NAME_PATTERN.finditer(output):
+            gpu_list = [m.group(1) for m in matches]
+            return gpu_list[gpu_id] if gpu_id < len(gpu_list) else ""
+
+        return ""
+
+    except subprocess.CalledProcessError as e:
+        print(f"GPU query failed (exit {e.returncode}): {e.stderr.strip()}")
+    except FileNotFoundError:
+        print("ROCm tools not installed (requires rocminfo)")
+    except subprocess.TimeoutExpired:
+        print("GPU query timeout (5s)")
+    except Exception as e:
+        print(f"GPU detection error: {str(e)}")
+
+    return ""
diff --git a/tile_engine/ops/gemm_multi_d/gemm_multi_d_config.py b/tile_engine/ops/gemm_multi_d/gemm_multi_d_config.py
new file mode 100644
index 0000000000..e5a879158f
--- /dev/null
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_config.py
@@ -0,0 +1,250 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+# -*- coding: utf-8 -*-
+
+"""
+Handles loading, parsing, and validation of JSON and Argument configuration parameters.
+"""
+
+from pathlib import Path
+from dataclasses import dataclass
+from typing import List, Optional, Union, Type
+import json
+
+
+@dataclass
+class EnumConfigParam:
+    """Represents an enumeration-type configuration parameter.
+
+    The parameter is described by an explicit list of candidate values
+    instead of a numeric range.
+    """
+
+    # Candidate values; each one is an int, a str or a bool.
+    values: List[Union[int, str, bool]]
+
+
+@dataclass
+class RangeConfigParam:
+    """Represents a numeric range-type configuration parameter"""
+
+    min: int
+    max: int
+    step: int
+    exclude: Optional[List[int]]
+
+    def generate_candidates(self) -> List[int]:
+        """Generates valid candidates after applying range constraints"""
+
+        if self.min > self.max:
+            raise ValueError(f"Invalid range: min({self.min}) > max({self.max})")
+        if self.step <= 0:
+            raise ValueError(f"Step must be positive, got {self.step}")
+
+        candidates = list(range(self.min, self.max + 1, self.step))
+
+        if hasattr(self, "exclude") and self.exclude:
+            if not isinstance(self.exclude, list):
+                raise TypeError("exclude must be list type")
+            exclude_set = set(self.exclude)
+            candidates = [x for x in candidates if x not in exclude_set]
+
+        if not candidates:
+            raise ValueError(
+                f"No valid candidates for range [{self.min}-{self.max}] "
+                f"with step {self.step} and excludes {self.exclude}"
+            )
+
+        return candidates
+
+
+@dataclass
+class DataType:
+    """Configuration class for data type parameter.
+
+    Stores one data type name per GEMM Multi-D operand.
+    """
+
+    # Data type name for the A operand.
+    a_datatype: str
+    # Data type name for the B operand.
+    b_datatype: str
+    # Data type name for the E operand.
+    e_datatype: str
+    # Data type names for the D0 and D1 operands.
+    d0_datatype: str
+    d1_datatype: str
+    # Data type names of the D operands collected in a list.
+    ds_datatype: List[str]
+
+
+@dataclass
+class Layout:
+    """Configuration class for Layout parameter.
+
+    Stores one layout code per GEMM Multi-D operand.
+    """
+
+    # Layout code for the A operand.
+    a_layout: str
+    # Layout code for the B operand.
+    b_layout: str
+    # Layout code for the E operand.
+    e_layout: str
+    # Layout codes for the D0 and D1 operands.
+    d0_layout: str
+    d1_layout: str
+    # Layout codes of the D operands collected in a list.
+    ds_layout: List[str]
+
+
+@dataclass
+class ArgumentConfig:
+    """Configuration class for Argument parameter."""
+
+    datatypes: DataType
+    layouts: Layout
+    function_name: str
+
+    @classmethod
+    def from_args(
+        cls: Type["ArgumentConfig"],
+        datatype: str,
+        layout: str,
+        elementwise_function: str,
+    ) -> "ArgumentConfig":
+        """configuration loader with validation controls"""
+
+        datatypes = DataType(
+            a_datatype=datatype,
+            b_datatype=datatype,
+            e_datatype=datatype,
+            d0_datatype=datatype,
+            d1_datatype=datatype,
+            ds_datatype=[datatype, datatype],
+        )
+
+        layout_parts = layout.lower()
+        assert len(layout_parts) == 4, (
+            f"Invalid layout string: {layout} (must be 4 characters like 'rcrr' where r stands for row major and c stands for column major)"
+        )
+        assert layout_parts[0] in ("r", "c"), (
+            f"Invalid matrix_a layout: {layout_parts[0]} (must be 'r' for row major or or 'c' for column major)"
+        )
+        assert layout_parts[1] in ("r", "c"), (
+            f"Invalid matrix_b layout: {layout_parts[1]} (must be 'r' for row major or or 'c' for column major)"
+        )
+        assert layout_parts[2] == "r", (
+            f"Invalid matrix_e layout: {layout_parts[2]} (must be 'r' only as currently we are supporting only row major)"
+        )
+        assert layout_parts[3] == "r", (
+            f"Invalid D dimension layout: {layout_parts[3]} (must be 'r' only as currently we are supporting only row major)"
+        )
+
+        layouts = Layout(
+            a_layout=layout[0],
+            b_layout=layout[1],
+            e_layout=layout[2],
+            d0_layout=layout[3],
+            d1_layout=layout[3],
+            ds_layout=[layout[3], layout[3]],
+        )
+        # Elementwise function name validation
+        valid_functions = ["mul", "add", "passthrough"]
+        if elementwise_function not in valid_functions:
+            raise ValueError(
+                f"Invalid elementwise function: {elementwise_function}. "
+                f"Valid options are: {', '.join(valid_functions)}"
+            )
+
+        # Set the function name based on the elementwise function
+        if elementwise_function == "mul":
+            function_name = "MultiDMultiply"
+        elif elementwise_function == "add":
+            function_name = "MultiDAdd"
+        elif elementwise_function == "passthrough":
+            function_name = "PassThrough"  # TODO Change this
+
+        return cls(datatypes=datatypes, layouts=layouts, function_name=function_name)
+
+
+@dataclass
+class TileConfig:
+    """Configuration class for tile parameter.
+
+    Each field is either an explicit value list (EnumConfigParam) or a
+    numeric range (RangeConfigParam).
+    """
+
+    # Tile sizes along m, n and k.
+    tile_m: Union[EnumConfigParam, RangeConfigParam]
+    tile_n: Union[EnumConfigParam, RangeConfigParam]
+    tile_k: Union[EnumConfigParam, RangeConfigParam]
+
+    # Warp parameters along m, n and k.
+    warp_m: Union[EnumConfigParam, RangeConfigParam]
+    warp_n: Union[EnumConfigParam, RangeConfigParam]
+    warp_k: Union[EnumConfigParam, RangeConfigParam]
+
+    # Warp-tile sizes along m, n and k.
+    warp_tile_m: Union[EnumConfigParam, RangeConfigParam]
+    warp_tile_n: Union[EnumConfigParam, RangeConfigParam]
+    warp_tile_k: Union[EnumConfigParam, RangeConfigParam]
+
+
+@dataclass
+class TraitConfig:
+    """Configuration class for kernel traits.
+
+    Every trait is an enumeration of candidate values.
+    """
+
+    # Candidate pipeline names.
+    pipeline: EnumConfigParam
+    # Candidate scheduler names.
+    scheduler: EnumConfigParam
+    # Candidate epilogue names.
+    epilogue: EnumConfigParam
+    # Candidate padding settings for the m, n and k directions.
+    pad_m: EnumConfigParam
+    pad_n: EnumConfigParam
+    pad_k: EnumConfigParam
+
+
+@dataclass
+class JsonConfig:
+    """Configuration class for JSON parameter."""
+
+    tile_config: TileConfig
+    trait_config: TraitConfig
+
+    @classmethod
+    def from_json(cls: Type["JsonConfig"], filepath: str) -> "JsonConfig":
+        """JSON configuration loader with validation controls"""
+        config_path = Path(filepath)
+
+        try:
+            if not config_path.exists():
+                raise FileNotFoundError(f"Config file {filepath} not found")
+
+            with config_path.open("r") as f:
+                config_dict = json.load(f)
+
+            # Parse tile config
+            def create_param(param_dict):
+                if "values" in param_dict:
+                    return EnumConfigParam(values=param_dict["values"])
+                else:
+                    return RangeConfigParam(
+                        min=param_dict["min"],
+                        max=param_dict["max"],
+                        step=param_dict["step"],
+                        exclude=param_dict.get("exclude", []),
+                    )
+
+            tile_config = TileConfig(
+                tile_m=create_param(config_dict["tile_config"]["tile_m"]),
+                tile_n=create_param(config_dict["tile_config"]["tile_n"]),
+                tile_k=create_param(config_dict["tile_config"]["tile_k"]),
+                warp_m=create_param(config_dict["tile_config"]["warp_m"]),
+                warp_n=create_param(config_dict["tile_config"]["warp_n"]),
+                warp_k=create_param(config_dict["tile_config"]["warp_k"]),
+                warp_tile_m=create_param(config_dict["tile_config"]["warp_tile_m"]),
+                warp_tile_n=create_param(config_dict["tile_config"]["warp_tile_n"]),
+                warp_tile_k=create_param(config_dict["tile_config"]["warp_tile_k"]),
+            )
+
+            # Parse trait config
+            trait_config = TraitConfig(
+                pipeline=EnumConfigParam(
+                    values=config_dict["trait_config"]["pipeline"]["values"]
+                ),
+                scheduler=EnumConfigParam(
+                    values=config_dict["trait_config"]["scheduler"]["values"]
+                ),
+                epilogue=EnumConfigParam(
+                    values=config_dict["trait_config"]["epilogue"]["values"]
+                ),
+                pad_m=EnumConfigParam(
+                    values=config_dict["trait_config"]["pad_m"]["values"]
+                ),
+                pad_n=EnumConfigParam(
+                    values=config_dict["trait_config"]["pad_n"]["values"]
+                ),
+                pad_k=EnumConfigParam(
+                    values=config_dict["trait_config"]["pad_k"]["values"]
+                ),
+            )
+
+            return cls(tile_config=tile_config, trait_config=trait_config)
+
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON format: {str(e)}")
+        except KeyError as e:
+            raise KeyError(f"Missing required configuration field: {str(e)}")
diff --git a/tile_engine/ops/gemm_multi_d/gemm_multi_d_host_api.hpp b/tile_engine/ops/gemm_multi_d/gemm_multi_d_host_api.hpp
new file mode 100644
index 0000000000..41fddf30aa
--- /dev/null
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_host_api.hpp
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstring>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "gemm_multi_d_dispatcher.hpp"
+#include "gemm_multi_d_common.hpp"
+
+// Maps a C++ element type to a short string name, exposed as the static
+// member `name`. The primary template is only declared, so using a type
+// without a specialization below is a compile-time error.
+template <typename T>
+struct DataTypeTraits;
+
+// float -> "fp32"
+template <>
+struct DataTypeTraits<float>
+{
+    static constexpr const char* name = "fp32";
+};
+
+// double -> "fp64"
+template <>
+struct DataTypeTraits<double>
+{
+    static constexpr const char* name = "fp64";
+};
+
+// ck_tile::half_t -> "fp16"
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr const char* name = "fp16";
+};
+
+// ck_tile::bf16_t -> "bf16"
+template <>
+struct DataTypeTraits<ck_tile::bf16_t>
+{
+    static constexpr const char* name = "bf16";
+};
+
+// ck_tile::fp8_t -> "fp8"
+template <>
+struct DataTypeTraits<ck_tile::fp8_t>
+{
+    static constexpr const char* name = "fp8";
+};
+
+// ck_tile::bf8_t -> "bf8"
+template <>
+struct DataTypeTraits<ck_tile::bf8_t>
+{
+    static constexpr const char* name = "bf8";
+};
+
+// ck_tile::int8_t -> "int8"
+template <>
+struct DataTypeTraits<ck_tile::int8_t>
+{
+    static constexpr const char* name = "int8";
+};
+
+// ck_tile::int32_t -> "int32"
+template <>
+struct DataTypeTraits<ck_tile::int32_t>
+{
+    static constexpr const char* name = "int32";
+};
+
+// ck_tile::pk_int4_t -> "pk_int4_t"
+// NOTE(review): unlike the other entries this name keeps the "_t" suffix;
+// confirm that consumers of `name` expect this spelling.
+template <>
+struct DataTypeTraits<ck_tile::pk_int4_t>
+{
+    static constexpr const char* name = "pk_int4_t";
+};
+
+// Returns a ck_tile::bool_constant whose value is true when the type of the
+// argument (ignoring cv/ref qualifiers) is
+// ck_tile::tensor_layout::gemm::RowMajor, and false otherwise. Only the type
+// of `layout_` is inspected; its value is never read.
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    using LayoutType = ck_tile::remove_cvref_t<decltype(layout_)>;
+    using RowMajor   = ck_tile::tensor_layout::gemm::RowMajor;
+
+    constexpr bool kIsRowMajor = std::is_same_v<LayoutType, RowMajor>;
+    return ck_tile::bool_constant<kIsRowMajor>{};
+}
+
+// Registers every command-line option of the GEMM Multi-D benchmark together
+// with its default value and help text, then parses argc/argv.
+//
+// Returns a tuple (parse_ok, arg_parser): parse_ok is the result of
+// ArgParser::parse, arg_parser holds the registered options and parsed values.
+inline auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
+        .insert("n", "4096", "The value for n dimension. Default is 4096.")
+        .insert("k", "2048", "The value for k dimension. Default is 2048.")
+        .insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
+        .insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
+        .insert("stride_ds", "0", "The stride value for tensor Ds  Default is 0.")
+        .insert("stride_e", "0", "The stride value for tensor E  Default is 0.")
+        .insert("split_k", "1", "The split value for k dimension. Default is 1.")
+        .insert("verify",
+                "1",
+                "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
+                "for validation on GPU. Default is 1, validation on CPU, as validation on GPU is "
+                "not supported.")
+        .insert("log",
+                "false",
+                "Wether output kernel instance information or not. Possible values are true or "
+                "false. Default is false")
+        .insert("warmup",
+                "50",
+                "The number of iterations before benchmarking the kernel. Default is 50.")
+        .insert("repeat",
+                "100",
+                "The number of iterations for benchmarking the kernel. Default is 100.")
+        .insert("timer",
+                "true",
+                "Indicates whether the timer is a GPU timer. Possible values are true or false. "
+                "Default is true.")
+        .insert("init",
+                "0",
+                "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
+                "for constant(1). Default is 0, random.")
+        .insert("flush_cache",
+                "false",
+                "To flush cache, possible values are true or false. "
+                "Default is false.")
+        .insert("rotating_count", "5", "number of iterations to rotate the cache. default is 5.")
+        .insert("metric",
+                "0",
+                "Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
+                "tflops, or 2 for bandwidth. Default is 0, latency.")
+        .insert("csv_filename",
+                "gemm_multi_d_kernel",
+                "The filename of benchmark result. Default is set to gemm_multi_d_kernel.")
+        .insert(
+            "pipeline",
+            "compv3",
+            "The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.")
+        // The help text below previously duplicated the "pipeline" description;
+        // it now describes the scheduler option and its actual default.
+        .insert("scheduler",
+                "intrawave",
+                "The type of scheduler. Possible values are intrawave or interwave. Default is "
+                "intrawave.")
+        .insert(
+            "epilogue",
+            "cshuffle",
+            "The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.")
+        .insert("pad_m",
+                "false",
+                "Whether pad or not in m direction. Possible values are true or false. Default is "
+                "false.")
+        .insert("pad_n",
+                "false",
+                "Whether pad or not in n direction. Possible values are true or false. Default is "
+                "false.")
+        .insert("pad_k",
+                "false",
+                "Whether pad or not in k direction. Possible values are true or false. Default is "
+                "false.");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// Builds a KernelTraits from the parsed "pipeline", "scheduler", "epilogue"
+// and "pad_m/n/k" options and returns whatever
+// GemmMultiDDispatcher::dispatch yields for that trait.
+//
+// Declared inline because this definition lives in a header: without it,
+// including the header from more than one translation unit produces multiple
+// definitions of the function at link time.
+inline auto get_kernel_func_by_trait(const ck_tile::ArgParser& arg_parser)
+{
+    KernelTraits trait;
+    trait.pipeline  = arg_parser.get_str("pipeline");
+    trait.scheduler = arg_parser.get_str("scheduler");
+    trait.epilogue  = arg_parser.get_str("epilogue");
+    trait.pad_m     = arg_parser.get_bool("pad_m");
+    trait.pad_n     = arg_parser.get_bool("pad_n");
+    trait.pad_k     = arg_parser.get_bool("pad_k");
+
+    return GemmMultiDDispatcher::dispatch(trait);
+}
diff --git a/tile_engine/ops/gemm_multi_d/gemm_multi_d_instance_builder.py b/tile_engine/ops/gemm_multi_d/gemm_multi_d_instance_builder.py
new file mode 100755
index 0000000000..4b5acf1363
--- /dev/null
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_instance_builder.py
@@ -0,0 +1,755 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+# -*- coding: utf-8 -*-
+
+"""
+generate kernel instances to speed up compilation
+"""
+
+import argparse
+import itertools
+from pathlib import Path
+from typing import List, Optional
+from gemm_multi_d_config import JsonConfig, ArgumentConfig, RangeConfigParam
+from gemm_multi_d_codegen_utils import (
+    DATA_TYPE_MAP,
+    LAYOUT_MAP,
+    PIPELINE_MAP,
+    SCHEDULER_MAP,
+    EPILOGUE_MAP,
+    BOOL_MAP,
+    warp_tile_supported_combinations,
+    trait_unsupported_combinations,
+    element_size,
+    get_gpu_name_by_id,
+)
+import logging
+
+logging.basicConfig(level=logging.INFO)
+
+
+class GemmMultiDCodeGenerator:
+    """GEMM (General Matrix Multiplication) Multi D code generator."""
+
+    def __init__(
+        self,
+        args: argparse.Namespace,
+        user_provided_config: Optional[JsonConfig] = None,
+    ):
+        """Set up the output directory, tile/trait config and argument config.
+
+        Args:
+            args: Parsed arguments; ``working_path``, ``datatype``, ``layout``
+                and ``elementwise_function`` are read from it.
+            user_provided_config: Configuration to use instead of the bundled
+                ``configs/default_config.json`` located next to this file.
+        """
+        # Directory that receives the generated files; created if missing.
+        self.output_dir = Path(args.working_path)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        if user_provided_config is not None:
+            self.config = user_provided_config
+        else:
+            config_path = (
+                Path(__file__).resolve().parent / "configs" / "default_config.json"
+            )
+            self.config = JsonConfig.from_json(config_path)
+
+        # Validated data types, layouts and element-wise function name.
+        self.args = ArgumentConfig.from_args(
+            args.datatype, args.layout, args.elementwise_function
+        )
+
+        # Filled by _generate_all_traits().
+        self.valid_trait_names: List[str] = []
+        # Filled by _get_valid_trait_tile_combinations(): trait name -> list of
+        # groups, each group being a list of tile parameter tuples.
+        self.valid_trait_tile_combinations: dict[str, list[list[tuple[int, ...]]]] = {}
+
+    def list_all_trait_names(self):
+        """List all possible kernel trait names into file."""
+        w_p = Path(self.output_dir)
+        file_path = w_p / "gemm_multi_d_instance_blobs.txt"
+        self._generate_all_traits()
+        self._get_valid_trait_tile_combinations()
+        file_range_map = {}
+        # Write all file paths to the header file
+        files_listed = 0
+        with file_path.open("w") as f:
+            # Core files
+            core_files = [
+                "gemm_multi_d_common.hpp",
+                "gemm_multi_d_instances.hpp",
+                "gemm_multi_d_dispatcher.hpp",
+            ]
+            for core_file in core_files:
+                f.write(str(w_p / core_file) + "\n")
+                files_listed += 1
+
+            # Trait header files
+            for trait in self.valid_trait_names:
+                trait_file = f"gemm_multi_d_{trait}.hpp"
+                f.write(str(w_p / trait_file) + "\n")
+                files_listed += 1
+            file_name = set()
+            # Instance source files
+            for trait, tile_valid_params in self.valid_trait_tile_combinations.items():
+                start_idx = files_listed
+                for tile in tile_valid_params:
+                    for (
+                        tile_m,
+                        tile_n,
+                        tile_k,
+                        warp_m,
+                        warp_n,
+                        warp_k,
+                        _,
+                        _,
+                        _,
+                    ) in tile:
+                        instance_name = f"gemm_multi_d_{trait}_{tile_m}x{tile_n}x{tile_k}_{warp_m}x{warp_n}x{warp_k}.cpp"
+
+                        if instance_name not in file_name:
+                            file_name.add(instance_name)
+                            f.write(str(w_p / instance_name) + "\n")
+                            files_listed += 1
+
+                file_range_map[trait] = (start_idx, files_listed)
+
+        file_path = w_p / "gemm_multi_d_instance_blobs_range.txt"
+        with file_path.open("w") as f:
+            for name, ranges in file_range_map.items():
+                start, last = ranges
+                f.write(name + " " + f"{start}" + " " + f"{last}" + "\n")
+
+    def _generate_all_traits(self):
+        """Generate all possible kernel traits names."""
+        params = ["pipeline", "epilogue", "scheduler", "pad_m", "pad_n", "pad_k"]
+
+        # Generate all unique_combinations
+        _unique = set(
+            itertools.product(
+                *[getattr(self.config.trait_config, param).values for param in params]
+            )
+        )
+
+        for combo in _unique:
+            pipeline, epilogue, scheduler, pad_m, pad_n, pad_k = combo
+            current_combination = (pipeline, epilogue, scheduler)
+
+            if current_combination not in trait_unsupported_combinations:
+                trait_name = (
+                    f"{pipeline}_{epilogue}_{scheduler}_"
+                    f"{BOOL_MAP(pad_m)}_{BOOL_MAP(pad_n)}_{BOOL_MAP(pad_k)}"
+                )
+                self.valid_trait_names.append(trait_name)
+            else:
+                logging.debug(f"Invalid combination: {pipeline}-{epilogue}-{scheduler}")
+
+    def _get_valid_trait_tile_combinations(self):
+        def get_tile_value(tile_param):
+            return (
+                tile_param.generate_candidates()
+                if isinstance(tile_param, RangeConfigParam)
+                else tile_param.values
+            )
+
+        tile_group = list(
+            itertools.product(
+                get_tile_value(self.config.tile_config.tile_m),
+                get_tile_value(self.config.tile_config.tile_n),
+                get_tile_value(self.config.tile_config.tile_k),
+            )
+        )
+
+        warp_group = list(
+            itertools.product(
+                get_tile_value(self.config.tile_config.warp_m),
+                get_tile_value(self.config.tile_config.warp_n),
+                get_tile_value(self.config.tile_config.warp_k),
+            )
+        )
+
+        warp_tile_group = list(
+            itertools.product(
+                get_tile_value(self.config.tile_config.warp_tile_m),
+                get_tile_value(self.config.tile_config.warp_tile_n),
+                get_tile_value(self.config.tile_config.warp_tile_k),
+            )
+        )
+
+        tile_params = {
+            t + w + wt for t in tile_group for w in warp_group for wt in warp_tile_group
+        }
+
+        for trait in self.valid_trait_names:
+            tile_valid_params = [
+                tile for tile in tile_params if self.is_tile_valid(tile, trait)
+            ]
+
+            if trait not in self.valid_trait_tile_combinations:
+                self.valid_trait_tile_combinations[trait] = []
+            self.valid_trait_tile_combinations[trait].append(tile_valid_params)
+
+    def is_tile_valid(self, tile: tuple, trait: str) -> bool:
+        """Check if the tile configuration is valid for the given trait."""
+        (
+            tile_m,
+            tile_n,
+            tile_k,
+            warp_m,
+            warp_n,
+            warp_k,
+            warp_tile_m,
+            warp_tile_n,
+            warp_tile_k,
+        ) = tile
+        pipeline, *_ = trait.split("_")
+
+        # Parameter validity check
+        invalid_params = []
+        if (warp_m, warp_n, warp_k) not in [(1, 4, 1), (2, 2, 1), (4, 1, 1)]:
+            invalid_params.append(
+                f"warp_m({warp_m}) * warp_n({warp_n}) * warp_k({warp_k})"
+            )
+        if (warp_m * warp_tile_m) == 0:
+            invalid_params.append(f"warp_m({warp_m}) * warp_tile_m({warp_tile_m})")
+        if (warp_n * warp_tile_n) == 0:
+            invalid_params.append(f"warp_n({warp_n}) * warp_tile_n({warp_tile_n})")
+        if (warp_k * warp_tile_k) == 0:
+            invalid_params.append(f"warp_k({warp_k}) * warp_tile_k({warp_tile_k})")
+
+        if invalid_params:
+            logging.debug(
+                f"Trait: [{trait}], Invalid warp configuration: {', '.join(invalid_params)}. "
+                f"Parameter combination: warp=({warp_m},{warp_n},{warp_k}), "
+                f"warp_tile=({warp_tile_m},{warp_tile_n},{warp_tile_k})"
+            )
+            return False
+        # Dimension alignment check
+        alignment_issues = []
+        if tile_m % (warp_m * warp_tile_m) != 0:
+            alignment_issues.append(
+                f"tile_m({tile_m}) % [{warp_m}x{warp_tile_m}] = {tile_m % (warp_m * warp_tile_m)}"
+            )
+        if tile_n % (warp_n * warp_tile_n) != 0:
+            alignment_issues.append(
+                f"tile_n({tile_n}) % [{warp_n}x{warp_tile_n}] = {tile_n % (warp_n * warp_tile_n)}"
+            )
+        if tile_k % (warp_k * warp_tile_k) != 0:
+            alignment_issues.append(
+                f"tile_k({tile_k}) % [{warp_k}x{warp_tile_k}] = {tile_k % (warp_k * warp_tile_k)}"
+            )
+
+        if alignment_issues:
+            logging.debug(
+                f"Trait: [{trait}], Dimension alignment failed: {', '.join(alignment_issues)}. "
+                f"Tile dimensions {tile_m}x{tile_n}x{tile_k} must be divisible by "
+                f"[warp]: {warp_m}x{warp_n}x{warp_k} x [warp_tile]: {warp_tile_m}x{warp_tile_n}x{warp_tile_k}"
+            )
+            return False
+
+        # LDS capacity verification
+        matrix_a_size = (tile_m * tile_k) * element_size(self.args.datatypes.a_datatype)
+
+        matrix_b_size = (tile_n * tile_k) * element_size(self.args.datatypes.b_datatype)
+
+        total_tile_in_lds = matrix_a_size + matrix_b_size
+
+        max_tile_size = 2**15 if pipeline == "compv4" else 2**16
+
+        if total_tile_in_lds > max_tile_size:
+            logging.debug(
+                f"LDS capacity exceeded [{trait}]: Total required {total_tile_in_lds:,}B ({total_tile_in_lds / 1024:.1f}KB) > "
+                f"maximum allowed {max_tile_size:,}B ({max_tile_size / 1024}KB). Breakdown:\n"
+                f"- Matrix A ({self.config.problem.datatype_map['matrix_a']}): {tile_m}x{tile_k} = {matrix_a_size:,}B\n"
+                f"- Matrix B ({self.config.problem.datatype_map['matrix_b']}): {tile_n}x{tile_k} = {matrix_b_size:,}B"
+            )
+            return False
+
+        # Warp combination validation
+        warp_tile_key = f"{self.args.datatypes.a_datatype}_{self.args.datatypes.b_datatype}_{self.args.datatypes.e_datatype}"
+
+        current_combination = [warp_tile_m, warp_tile_n, warp_tile_k]
+
+        gpu_name = get_gpu_name_by_id(0)
+
+        gpu_warp_tile_key = warp_tile_supported_combinations.get(gpu_name, {})
+        if not gpu_warp_tile_key:
+            logging.debug(
+                f"Trait: [{trait}], No valid warp tile combinations found for {gpu_name}/{warp_tile_key}, skip this check."
+            )
+            return False
+
+        allowed_combinations = gpu_warp_tile_key.get(warp_tile_key, [])
+        if not allowed_combinations:
+            logging.debug(
+                f"Trait: [{trait}], No valid warp tile combinations found for {gpu_name}/{warp_tile_key}, skip this check."
+            )
+            return False
+
+        if current_combination not in allowed_combinations:
+            logging.debug(
+                f"Trait: [{trait}], Invalid warp combination: {current_combination} not in allowed list. "
+                f"Valid combinations for data type '{warp_tile_key}': {allowed_combinations}"
+            )
+            return False
+
+        return True
+
+    def generate_all_instance_files(self):
+        """Generate all kernel instances files."""
+        self._generate_common_header_file()
+        self._generate_all_trait_files()
+        self._generate_dispatcher_file()
+
+    def _generate_common_header_file(self):
+        """Generate common header file with datatypes and layout."""
+
+        acc_type = "float"  # As we are currently supporting only fp16
+
+        content = f"""
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
+
+// Data types
+using ADataType = {DATA_TYPE_MAP[self.args.datatypes.a_datatype]};
+using BDataType = {DATA_TYPE_MAP[self.args.datatypes.b_datatype]};
+using AccDataType = {acc_type};
+using D0DataType = {DATA_TYPE_MAP[self.args.datatypes.d0_datatype]};
+using D1DataType = {DATA_TYPE_MAP[self.args.datatypes.d1_datatype]};
+using DsDataType = ck_tile::tuple<D0DataType, D1DataType>;
+using EDataType = {DATA_TYPE_MAP[self.args.datatypes.e_datatype]};
+
+
+// Layout configurations
+using ALayout = {LAYOUT_MAP[self.args.layouts.a_layout]};
+using BLayout = {LAYOUT_MAP[self.args.layouts.b_layout]};
+using D0Layout = {LAYOUT_MAP[self.args.layouts.d0_layout]};
+using D1Layout = {LAYOUT_MAP[self.args.layouts.d1_layout]};
+using DsLayout = ck_tile::tuple<D0Layout, D1Layout>;
+using ELayout = {LAYOUT_MAP[self.args.layouts.e_layout]};
+
+// Element-wise function for D
+using ElementWiseFn = ck_tile::element_wise::{self.args.function_name};
+
+"""
+
+        (self.output_dir / "gemm_multi_d_common.hpp").write_text(content)
+
+    def _generate_all_trait_files(self):
+        """Generate all kernel traits into files."""
+        if not self.valid_trait_names:
+            self._generate_all_traits()
+            self._get_valid_trait_tile_combinations()
+        for trait in self.valid_trait_names:
+            self._generate_trait_file(trait)
+        self._generate_instantiation_source_files()
+        self._generate_common_instance_header_file()
+
+    def _generate_trait_file(self, trait: str):
+        """Generate a trait with all tile/warp combinations."""
+        pipeline, epilogue, scheduler, pad_m, pad_n, pad_k = trait.split("_")
+        filename = f"gemm_multi_d_{trait}.hpp"
+
+        content = f"""
+#pragma once
+
+#include "gemm_multi_d_common.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/host.hpp"
+
+namespace {trait} {{
+"""
+        # Add template struct with configuration
+        content += self._generate_kernel_struct(
+            pipeline, epilogue, scheduler, pad_m, pad_n, pad_k
+        )
+
+        content += f"\n}} // namespace {trait}\n"
+        (self.output_dir / filename).write_text(content)
+
+    def _generate_kernel_struct(
+        self,
+        pipeline: str,
+        epilogue: str,
+        scheduler: str,
+        pad_m: str,
+        pad_n: str,
+        pad_k: str,
+    ) -> str:
+        """Generate the code block of kernel struct"""
+        return f"""
+
+template <int TileM, int TileN, int TileK,
+          int WarpM, int WarpN, int WarpK,
+          int WarpTileM, int WarpTileN, int WarpTileK,
+          typename CDEElementWise = ElementWiseFn>
+struct GemmKernelMultiD {{
+    static constexpr bool kPadM = {pad_m};
+    static constexpr bool kPadN = {pad_n};
+    static constexpr bool kPadK = {pad_k};
+
+    static float launch(ck_tile::GemmMultiDHostArgs<DsDataType::size()>& args, const ck_tile::stream_config& stream) {{
+        static constexpr bool DoubleSmemBuffer ={"true" if pipeline == "compv4" else "false"};
+        
+        static constexpr bool TransposeC = false;
+
+        static constexpr int kBlockPerCu                         = 1;
+        static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+        static constexpr ck_tile::index_t TileParitionerM01      = 4;
+
+        using GemmShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<TileM, TileN, TileK>,
+                                   ck_tile::sequence<WarpM, WarpN, WarpK>,
+                                   ck_tile::sequence<WarpTileM, WarpTileN, WarpTileK>>;
+
+        using TilePartitioner =
+            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                      TileParitionerGroupNum,
+                                                      TileParitionerM01>;
+
+        using Traits  =
+            ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, ELayout>;
+
+        using GemmUniversalTraits =
+            ck_tile::TileGemmUniversalTraits<kPadM, kPadN, kPadK, DoubleSmemBuffer,
+                                             ALayout, BLayout, ELayout, TransposeC>;
+
+        using GemmPipelineProblem =
+            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+        using BaseGemmPipeline = {PIPELINE_MAP[pipeline][0]}<GemmPipelineProblem>;
+
+        const ck_tile::index_t k_grain     = args.k_batch * TileK;
+        const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * TileK;
+        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+        float ave_time{{0}};
+
+        const auto Run = [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {{
+            constexpr bool has_hot_loop_v = has_hot_loop_.value;
+            constexpr auto tail_number_v  = tail_number_.value;
+            constexpr auto scheduler      = {SCHEDULER_MAP[scheduler]};
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using UniversalGemmProblem =
+                ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                      BDataType,
+                                                      AccDataType,
+                                                      GemmShape,
+                                                      GemmUniversalTraits,
+                                                      scheduler,
+                                                      has_hot_loop_v,
+                                                      tail_number_v>;
+
+            using GemmPipeline = {PIPELINE_MAP[pipeline][1]}<UniversalGemmProblem>;
+            {EPILOGUE_MAP[epilogue]}
+            using Kernel = ck_tile::GemmKernelMultiD<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+
+            const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+            const dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {{
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!");
+            }}
+
+            if(stream.log_level_ > 0)
+            {{
+                std::cout << "Launching kernel with args:"
+                      << " grid: {{" << grids.x << ", " << grids.y << ", " << grids.z << "}}"
+                      << ", blocks: {{" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}}"
+                      << std::endl;
+            }}
+
+            ave_time = ck_tile::launch_kernel(stream,
+                                          ck_tile::make_kernel<kBlockPerCu>(
+                                              Kernel{{}}, grids, blocks, 0, kargs));
+                
+            return ave_time;
+
+        }};
+
+        const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {{
+            if(args.k_batch == 1) {{
+                Run(has_hot_loop_,
+                    tail_number_,
+                    ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                            ck_tile::memory_operation_enum::set>{{}});
+            }} else {{
+                Run(has_hot_loop_,
+                    tail_number_,
+                    ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                            ck_tile::memory_operation_enum::atomic_add>{{}});
+            }}
+        }};
+
+        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+
+        return ave_time;
+    }}
+
+    static std::string get_name() {{
+        return std::string("gemm_multi_d_") + std::to_string(TileM) + "x" + std::to_string(TileN) + "x" + std::to_string(TileK) +
+                "_" + std::to_string(WarpM) + "x" + std::to_string(WarpN) + "x" + std::to_string(WarpK) + "_" +
+                std::to_string(WarpTileM) + "x" + std::to_string(WarpTileN) + "x" + std::to_string(WarpTileK) + "_" +
+                "{pad_m}" + "_" +
+                "{pad_n}" + "_" +
+                "{pad_k}" + "_" +
+                "{pipeline}" + "_" +
+                "{epilogue}" + "_" +
+                "{scheduler}";
+    }}
+}};
+"""
+
+    def _generate_instantiation_source_files(self):
+        """Generate kernel instance instantiation source files"""
+        tile_map = {}
+        for trait, tile_valid_params in self.valid_trait_tile_combinations.items():
+            for tile in tile_valid_params:
+                for (
+                    tile_m,
+                    tile_n,
+                    tile_k,
+                    warp_m,
+                    warp_n,
+                    warp_k,
+                    warp_tile_m,
+                    warp_tile_n,
+                    warp_tile_k,
+                ) in tile:
+                    key = f"{tile_m}x{tile_n}x{tile_k}x{warp_m}x{warp_n}x{warp_k}"
+                    value = f"{warp_tile_m}x{warp_tile_n}x{warp_tile_k}"
+                    if key not in tile_map:
+                        tile_map[key] = set()
+                    tile_map[key].add(value)
+
+        files_listed = 0
+        for trait, _ in self.valid_trait_tile_combinations.items():
+            for block_tile, warp_tiles in tile_map.items():
+                tile_m, tile_n, tile_k, warp_m, warp_n, warp_k = map(
+                    int, block_tile.split("x")
+                )
+
+                content = f"""
+#include "gemm_multi_d_{trait}.hpp" 
+
+"""
+                for warp_tile in warp_tiles:
+                    warp_tile_m, warp_tile_n, warp_tile_k = map(
+                        int, warp_tile.split("x")
+                    )
+
+                    files_listed = files_listed + 1
+                    content = (
+                        content
+                        + f"""
+template struct {trait}::GemmKernelMultiD<{tile_m}, {tile_n}, {tile_k}, {warp_m}, {warp_n}, {warp_k}, {warp_tile_m}, {warp_tile_n}, {warp_tile_k}>;"""
+                    )
+                content += """
+"""
+                (
+                    self.output_dir
+                    / f"gemm_multi_d_{trait}_{tile_m}x{tile_n}x{tile_k}_{warp_m}x{warp_n}x{warp_k}.cpp"
+                ).write_text(content)
+        print(f"Generated {files_listed} kernel instances in total.")
+
+    def _generate_common_instance_header_file(self):
+        """Generate common instance header into file."""
+        content = """
+#pragma once
+"""
+        for trait in self.valid_trait_names:
+            content += f'#include "gemm_multi_d_{trait}.hpp"\n'
+        (self.output_dir / "gemm_multi_d_instances.hpp").write_text(content)
+
+    def _generate_dispatcher_file(self):
+        """Generate the code block of dispatch mechanism."""
+        content = """
+#pragma once
+
+#include <unordered_map>
+#include <functional>
+#include <vector>
+
+#include "gemm_multi_d_common.hpp"
+#include "gemm_multi_d_instances.hpp"
+
+/// @brief Defines the configuration parameters for a GEMM Multi D operation, enabling the selection of a
+/// specific kernel instance based on the provided settings.
+struct KernelTraits
+{
+    /// @brief The name of the pipeline.
+    std::string pipeline;
+    /// @brief The name of the scheduler (e.g., "intrawave", "interwave").
+    std::string scheduler;
+    /// @brief The name of the epilogue (e.g., "cshuffle", "default").
+    std::string epilogue;
+    /// @brief Indicates whether padding is applied to the M dimension.
+    bool pad_m;
+    /// @brief Indicates whether padding is applied to the N dimension.
+    bool pad_n;
+    /// @brief Indicates whether padding is applied to the K dimension.
+    bool pad_k;
+};
+
+struct GemmMultiDDispatcher {
+    static auto& get_kernel_map() {
+        // Use a static local variable
+        static std::unordered_map<
+            std::string,
+            std::vector<std::function<std::tuple<std::string, float>(ck_tile::GemmMultiDHostArgs<DsDataType::size()>&, const ck_tile::stream_config&)>>>
+            kernel_map;
+        return kernel_map;
+    }
+
+    static void init() {
+        auto& kernel_map = get_kernel_map();
+        if(!kernel_map.empty()) return;
+        \n"""
+
+        for trait, tile_valid_params in self.valid_trait_tile_combinations.items():
+            content += f"""         kernel_map["{trait}"] = {{"""
+            for _, tile in enumerate(tile_valid_params):
+                for j in range(len(tile)):
+                    (
+                        tile_m,
+                        tile_n,
+                        tile_k,
+                        warp_m,
+                        warp_n,
+                        warp_k,
+                        warp_tile_m,
+                        warp_tile_n,
+                        warp_tile_k,
+                    ) = tile[j]
+                    content += """[=](ck_tile::GemmMultiDHostArgs<DsDataType::size()>& args, const ck_tile::stream_config& stream) { """
+
+                    content += f"""
+                        return run_kernel<{trait}::GemmKernelMultiD<{tile_m}, {tile_n}, {tile_k}, {warp_m}, {warp_n}, {warp_k}, {warp_tile_m}, {warp_tile_n}, {warp_tile_k}>>(args, stream);"""
+
+                    if j == len(tile) - 1:
+                        content += """
+                                } """
+                    else:
+                        content += """
+                                }, """
+            content += """
+            };\n """
+
+        content += """    }
+
+    template <typename Kernel>
+    static std::tuple<std::string, float> run_kernel(ck_tile::GemmMultiDHostArgs<DsDataType::size()>& args, const ck_tile::stream_config& stream)
+    {
+        std::string name = Kernel::get_name();
+        float avg_time = Kernel::launch(args, stream);
+        
+        return std::make_tuple(name, avg_time);
+    }
+    
+    
+    static auto dispatch(const KernelTraits& trait) {
+        init();
+        const std::string key = assemble_key(trait);
+        auto& kernel_map = get_kernel_map();
+        if(auto it = kernel_map.find(key); it != kernel_map.end())
+        {
+            return it->second;
+        }
+        throw std::runtime_error("No suitable kernel found: " + key);
+    }
+
+private:
+    static std::string assemble_key(const KernelTraits &trait) {
+        return std::string(trait.pipeline) + "_" +
+               trait.epilogue + "_" +
+               trait.scheduler + "_" +
+               (trait.pad_m ? "true" : "false") + "_" +
+               (trait.pad_n ? "true" : "false") + "_" +
+               (trait.pad_k ? "true" : "false");
+    }
+};
+
+"""
+        (self.output_dir / "gemm_multi_d_dispatcher.hpp").write_text(content)
+
+
+def do_list_blobs(
+    args: argparse.Namespace, user_provide_config: Optional[JsonConfig] = None
+):
+    generator = GemmMultiDCodeGenerator(args, user_provide_config)
+    generator.list_all_trait_names()
+
+
+def do_gen_blobs(
+    args: argparse.Namespace, user_provide_config: Optional[JsonConfig] = None
+):
+    generator = GemmMultiDCodeGenerator(args, user_provide_config)
+    generator.generate_all_instance_files()
+
+
+def main(args):
+    gemm_multi_d_config = JsonConfig.from_json(args.config_json)
+
+    if args.list_blobs:
+        do_list_blobs(args, gemm_multi_d_config)
+    elif args.gen_blobs:
+        do_gen_blobs(args, gemm_multi_d_config)
+    else:
+        logging.warning(
+            "No mode specified (use --list_blobs or --gen_blobs). Generating by default..."
+        )
+        do_gen_blobs(args, gemm_multi_d_config)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="generate",
+        description="gen API for CK gemm multi D kernel",
+    )
+    parser.add_argument(
+        "-w",
+        "--working_path",
+        default="./",
+        required=False,
+        help="The path where all the blobs are going to be generated",
+    )
+    parser.add_argument(
+        "-j",
+        "--config_json",
+        required=False,
+        help="Path to the json which contains the configurations that user provide",
+    )
+    parser.add_argument(
+        "-d",
+        "--datatype",
+        required=True,
+        help="Specify what datatype to use for the kernel generation, e.g. fp16",
+    )
+    parser.add_argument(
+        "-ly",
+        "--layout",
+        required=True,
+        help="Specify what layout to use for the kernel generation, e.g. rcrr, rrrr",
+    )
+    parser.add_argument(
+        "-ef",
+        "--elementwise_function",
+        required=True,
+        help="Specify what element wise function for D, e.g. mul, add, passthrough",
+    )
+    parser.add_argument(
+        "-l",
+        "--list_blobs",
+        action="store_true",
+        help="List all kernel instances to file",
+    )
+    parser.add_argument(
+        "-g",
+        "--gen_blobs",
+        action="store_true",
+        help="Generate all kernel instances into different files",
+    )
+
+    args = parser.parse_args()
+
+    main(args)
diff --git a/tile_engine/ops/gemm_multi_d/gemm_multi_d_profiler.hpp b/tile_engine/ops/gemm_multi_d/gemm_multi_d_profiler.hpp
new file mode 100644
index 0000000000..0106d76c05
--- /dev/null
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_profiler.hpp
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+
+#include "ck_tile/host/device_prop.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "benchmark_gemm_multi_d.hpp"
+
+/// @brief Singleton profiler that runs every candidate GEMM Multi-D kernel on one problem,
+/// optionally verifies each result against a host reference, and records the performance
+/// of the kernels that pass so the best one can be selected afterwards.
+class GemmMultiDProfiler
+{
+    public:
+    /// @brief Returns the single profiler instance.
+    /// @param setting Profiler settings. The instance is a function-local static, so only the
+    /// value passed on the first call is used to construct it.
+    static GemmMultiDProfiler& instance(Setting setting)
+    {
+        static GemmMultiDProfiler instance{setting};
+        return instance;
+    }
+
+    /// @brief Runs every callable on the given problem and records each result.
+    /// @param gemm_multi_d_problem Problem description; its stride fields are overwritten
+    /// with the values returned by ck_tile::get_default_stride.
+    /// @param callables Kernel launchers; each returns the kernel name and its average time.
+    void benchmark(
+        GemmMultiDProblem& gemm_multi_d_problem,
+        std::vector<std::function<std::tuple<std::string, float>(
+            ck_tile::GemmMultiDHostArgs<DsDataType::size()>&, const ck_tile::stream_config&)>>&
+            callables)
+    {
+        const ALayout layout_a   = ALayout{};
+        const BLayout layout_b   = BLayout{};
+        const D0Layout layout_d0 = D0Layout{};
+        const D1Layout layout_d1 = D1Layout{};
+        const ELayout layout_e   = ELayout{};
+
+        // Resolve every stride through get_default_stride before building the tensors
+        // (presumably this fills in a default when the caller left the stride unset --
+        // TODO confirm against ck_tile).
+        gemm_multi_d_problem.stride_a_ = ck_tile::get_default_stride(gemm_multi_d_problem.m_,
+                                                                     gemm_multi_d_problem.k_,
+                                                                     gemm_multi_d_problem.stride_a_,
+                                                                     is_row_major(layout_a));
+        gemm_multi_d_problem.stride_b_ = ck_tile::get_default_stride(gemm_multi_d_problem.k_,
+                                                                     gemm_multi_d_problem.n_,
+                                                                     gemm_multi_d_problem.stride_b_,
+                                                                     is_row_major(layout_b));
+        gemm_multi_d_problem.stride_d0_ =
+            ck_tile::get_default_stride(gemm_multi_d_problem.m_,
+                                        gemm_multi_d_problem.n_,
+                                        gemm_multi_d_problem.stride_d0_,
+                                        is_row_major(layout_d0));
+        gemm_multi_d_problem.stride_d1_ =
+            ck_tile::get_default_stride(gemm_multi_d_problem.m_,
+                                        gemm_multi_d_problem.n_,
+                                        gemm_multi_d_problem.stride_d1_,
+                                        is_row_major(layout_d1));
+        gemm_multi_d_problem.stride_e_ = ck_tile::get_default_stride(gemm_multi_d_problem.m_,
+                                                                     gemm_multi_d_problem.n_,
+                                                                     gemm_multi_d_problem.stride_e_,
+                                                                     is_row_major(layout_e));
+
+        // Host-side tensors: A is m x k, B is k x n, D0/D1/E are m x n.
+        ck_tile::HostTensor<ADataType> a_m_k(
+            ck_tile::host_tensor_descriptor(gemm_multi_d_problem.m_,
+                                            gemm_multi_d_problem.k_,
+                                            gemm_multi_d_problem.stride_a_,
+                                            is_row_major(layout_a)));
+        ck_tile::HostTensor<BDataType> b_k_n(
+            ck_tile::host_tensor_descriptor(gemm_multi_d_problem.k_,
+                                            gemm_multi_d_problem.n_,
+                                            gemm_multi_d_problem.stride_b_,
+                                            is_row_major(layout_b)));
+        ck_tile::HostTensor<D0DataType> d0_m_n(
+            ck_tile::host_tensor_descriptor(gemm_multi_d_problem.m_,
+                                            gemm_multi_d_problem.n_,
+                                            gemm_multi_d_problem.stride_d0_,
+                                            is_row_major(layout_d0)));
+        ck_tile::HostTensor<D1DataType> d1_m_n(
+            ck_tile::host_tensor_descriptor(gemm_multi_d_problem.m_,
+                                            gemm_multi_d_problem.n_,
+                                            gemm_multi_d_problem.stride_d1_,
+                                            is_row_major(layout_d1)));
+        ck_tile::HostTensor<EDataType> e_m_n_device_result(
+            ck_tile::host_tensor_descriptor(gemm_multi_d_problem.m_,
+                                            gemm_multi_d_problem.n_,
+                                            gemm_multi_d_problem.stride_e_,
+                                            is_row_major(layout_e)));
+
+        // Each filler is instantiated with the element type of the tensor it fills.
+        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+        ck_tile::FillUniformDistribution<D0DataType>{-1.f, 1.f}(d0_m_n);
+        ck_tile::FillUniformDistribution<D1DataType>{-1.f, 1.f}(d1_m_n);
+
+        ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem d0_m_n_dev_buf(d0_m_n.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem d1_m_n_dev_buf(d1_m_n.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem e_m_n_dev_buf(e_m_n_device_result.get_element_space_size_in_bytes());
+
+        a_m_k_dev_buf.ToDevice(a_m_k.mData.data());
+        b_k_n_dev_buf.ToDevice(b_k_n.mData.data());
+        d0_m_n_dev_buf.ToDevice(d0_m_n.mData.data());
+        d1_m_n_dev_buf.ToDevice(d1_m_n.mData.data());
+
+        // Start from a zeroed output on both device and host.
+        e_m_n_dev_buf.SetZero();
+        e_m_n_device_result.SetZero();
+
+        // NOTE(review): exactly two D tensors are supplied here, so this assumes
+        // DsDataType::size() == 2 -- confirm against the DsDataType definition.
+        std::array<const void*, DsDataType::size()> ds_ptr_buf = {d0_m_n_dev_buf.GetDeviceBuffer(),
+                                                                  d1_m_n_dev_buf.GetDeviceBuffer()};
+
+        std::array<ck_tile::index_t, DsDataType::size()> stridesDs = {
+            gemm_multi_d_problem.stride_d0_, gemm_multi_d_problem.stride_d1_};
+
+        ck_tile::GemmMultiDHostArgs<DsDataType::size()> gemm_multi_d_args = {
+            a_m_k_dev_buf.GetDeviceBuffer(),
+            b_k_n_dev_buf.GetDeviceBuffer(),
+            ds_ptr_buf,
+            e_m_n_dev_buf.GetDeviceBuffer(),
+            gemm_multi_d_problem.split_k_,
+            gemm_multi_d_problem.m_,
+            gemm_multi_d_problem.n_,
+            gemm_multi_d_problem.k_,
+            gemm_multi_d_problem.stride_a_,
+            gemm_multi_d_problem.stride_b_,
+            stridesDs,
+            gemm_multi_d_problem.stride_e_,
+        };
+
+        ck_tile::HostTensor<EDataType> e_m_n_host_result(
+            ck_tile::host_tensor_descriptor(gemm_multi_d_problem.m_,
+                                            gemm_multi_d_problem.n_,
+                                            gemm_multi_d_problem.stride_e_,
+                                            is_row_major(layout_e)));
+
+        // The host reference is computed once and reused for every kernel.
+        if(setting_.verify_)
+        {
+            gemm_multi_d_host_reference(
+                setting_.verify_, a_m_k, b_k_n, d0_m_n, d1_m_n, e_m_n_host_result);
+        }
+
+        for(auto& callable : callables)
+        {
+            auto kernel_run_result =
+                callable(gemm_multi_d_args,
+                         ck_tile::stream_config{
+                             nullptr, true, setting_.log_, setting_.n_warmup_, setting_.n_repeat_});
+
+            process_result(gemm_multi_d_problem,
+                           e_m_n_dev_buf,
+                           e_m_n_host_result,
+                           e_m_n_device_result,
+                           kernel_run_result);
+        }
+    }
+
+    /// @brief Computes the performance figures for one kernel run, optionally verifies the
+    /// output, and stores the instance when verification passes or is disabled.
+    /// @param gemm_multi_d_problem Problem the kernel was run on.
+    /// @param e_m_n_dev_buf Device output buffer; copied to the host, then zeroed.
+    /// @param e_m_n_host_result Host reference output used for comparison.
+    /// @param e_m_n_dev_result Host tensor receiving the device output; zeroed on exit.
+    /// @param kernel_run_result Kernel name and average time as returned by the launcher.
+    void process_result(const GemmMultiDProblem& gemm_multi_d_problem,
+                        ck_tile::DeviceMem& e_m_n_dev_buf,
+                        ck_tile::HostTensor<EDataType>& e_m_n_host_result,
+                        ck_tile::HostTensor<EDataType>& e_m_n_dev_result,
+                        const std::tuple<std::string, float>& kernel_run_result)
+    {
+        auto [name, avg_time] = kernel_run_result;
+
+        KernelInstance kernel_instance{name, gemm_multi_d_problem, {-1.0f, -1.0f, -1.0f}};
+
+        static constexpr ck_tile::index_t NumDTensor = DsDataType::size();
+        std::size_t flop = 0, num_byte = 0;
+        flop += std::size_t(2) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_ *
+                gemm_multi_d_problem.k_;
+        // NOTE(review): the per-D-tensor term added to `flop` below is scaled by the element
+        // size in bytes, exactly like the `num_byte` term -- confirm this is intended.
+        ck_tile::static_for<0, NumDTensor, 1>{}([&](auto i) {
+            num_byte += sizeof(ck_tile::remove_cvref_t<std::tuple_element_t<i, DsDataType>>) *
+                        gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_;
+            flop += sizeof(ck_tile::remove_cvref_t<std::tuple_element_t<i, DsDataType>>) *
+                    gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_;
+        });
+        num_byte += sizeof(ADataType) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.k_ +
+                    sizeof(BDataType) * gemm_multi_d_problem.k_ * gemm_multi_d_problem.n_ +
+                    sizeof(EDataType) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_;
+
+        kernel_instance.perf_result_.latency_   = avg_time;
+        kernel_instance.perf_result_.tflops_    = static_cast<float>(flop) / 1.E9 / avg_time;
+        kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time;
+
+        if(setting_.log_ > 0)
+        {
+            std::cout << kernel_instance << std::endl;
+        }
+
+        e_m_n_dev_buf.FromDevice(e_m_n_dev_result.data());
+        // With verification disabled every kernel is accepted.
+        bool verified_correct =
+            !setting_.verify_ ||
+            compare(name, gemm_multi_d_problem.k_, e_m_n_dev_result, e_m_n_host_result);
+
+        if(verified_correct)
+        {
+            kernel_instances_.emplace_back(kernel_instance);
+        }
+        else
+        {
+            std::cout << "Verification failed, skip kernel: " << name << std::endl;
+        }
+
+        // Reset the output buffers before the next kernel runs.
+        e_m_n_dev_buf.SetZero();
+        e_m_n_dev_result.SetZero();
+    }
+
+    /// @brief Picks the best recorded kernel instance for @p metric, prints it, and appends
+    /// it as one row to "<csv_filename_>.csv" when a CSV file name is configured.
+    /// @param metric Metric used to rank the recorded instances.
+    /// @return A copy of the best kernel instance.
+    /// @throws std::runtime_error when no kernel instance has been recorded.
+    KernelInstance select_best_instance(Metric metric)
+    {
+        if(kernel_instances_.empty())
+            throw std::runtime_error("Empty instances");
+
+        auto kernel_instance = *std::max_element(kernel_instances_.begin(),
+                                                 kernel_instances_.end(),
+                                                 [metric](const auto& a, const auto& b) {
+                                                     return PerformanceResult::compare(
+                                                         b.perf_result_, a.perf_result_, metric);
+                                                 });
+
+        std::cout << "**********************************" << std::endl;
+        std::cout << "According to given metrics: " << get_metric_name(metric) << "\n"
+                  << "The best kernel instance is: " << kernel_instance << std::endl;
+        std::cout << "**********************************" << std::endl;
+
+        if(!setting_.csv_filename_.empty())
+        {
+            std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app);
+
+            if(!file.is_open())
+            {
+                std::cerr << "Warning: Failed to open CSV file for writing." << std::endl;
+            }
+            else
+            {
+                // A stream opened with std::ios::app may report position 0 until the first
+                // write even when the file already has content, so seek to the end before
+                // deciding whether the header row is still needed.
+                file.seekp(0, std::ios::end);
+                if(file.tellp() == 0)
+                {
+                    // Column names follow the order of the fields written for each row below;
+                    // structured_sparsity is the empty field emitted just before the name.
+                    file << "rocm_version,device_name,"
+                         << "split_k,m,n,k,stride_a,stride_b,stride_d0,stride_d1,stride_e,"
+                         << "dtype_a,dtype_b,dtype_d0,dtype_d1,dtype_acc,dtype_e,"
+                         << "layout_a,layout_b,layout_d0,layout_d1,layout_e,"
+                         << "structured_sparsity," << "name,"
+                         << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n";
+                }
+
+                const auto& problem = kernel_instance.problem_;
+                const auto& name    = kernel_instance.name_;
+                const auto& perf    = kernel_instance.perf_result_;
+
+                file << get_rocm_version() << "," << ck_tile::get_device_name() << ","
+                     << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << ","
+                     << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << ","
+                     << problem.stride_d0_ << "," << problem.stride_d1_ << "," << problem.stride_e_
+                     << "," << problem.dtype_a_ << "," << problem.dtype_b_ << ","
+                     << problem.dtype_d0_ << "," << problem.dtype_d1_ << "," << problem.dtype_acc_
+                     << "," << problem.dtype_e_ << "," << problem.layout_a_ << ","
+                     << problem.layout_b_ << "," << problem.layout_d0_ << "," << problem.layout_d1_
+                     << "," << problem.layout_e_ << "," << "," << name << "," << std::fixed
+                     << std::setprecision(4) << perf.latency_ << "," << std::fixed
+                     << std::setprecision(4) << perf.tflops_ << "," << std::fixed
+                     << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric)
+                     << "\n";
+
+                if(!file)
+                {
+                    std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl;
+                }
+            }
+        }
+
+        return kernel_instance;
+    }
+
+    GemmMultiDProfiler(const GemmMultiDProfiler&)            = delete;
+    GemmMultiDProfiler& operator=(const GemmMultiDProfiler&) = delete;
+
+    private:
+    // Construction and destruction are private; instance() is the only access point.
+    ~GemmMultiDProfiler() { kernel_instances_.clear(); }
+    GemmMultiDProfiler(Setting setting) : setting_(setting) {}
+
+    // Settings captured when the singleton was constructed.
+    Setting setting_;
+
+    // Instances accepted by process_result (verified, or verification disabled).
+    std::vector<KernelInstance> kernel_instances_;
+};