Merge branch 'develop' into jakpiase/gemm_pipeline_mem_skip_lds

2026-07-03 05:37:34 +00:00 · 2025-10-02 19:00:36 +00:00
parent c07cec8809 ef43078788
commit ae4fa7bfe8
1369 changed files with 94454 additions and 22245 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,8 +1,8 @@
-* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli @shumway @vidyasagar-amd
+* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd
 # Documentation files
-docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd @ddembeckAMD
-*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd @ddembeckAMD
-*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd @ddembeckAMD
-.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd @ddembeckAMD
+docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
+*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
+*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
+.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
 # Header directory for Doxygen documentation
-library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd
+library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd
--- a/.github/scripts/therock_configure_ci.py
+++ b/.github/scripts/therock_configure_ci.py
@@ -42,6 +42,24 @@ def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
            file=sys.stderr,
        )
        return None
+    
+GITHUB_WORKFLOWS_CI_PATTERNS = [
+    "therock*",
+]
+
+def is_path_workflow_file_related_to_ci(path: str) -> bool:
+    return any(
+        fnmatch.fnmatch(path, ".github/workflows/" + pattern)
+        for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
+    ) or any(
+        fnmatch.fnmatch(path, ".github/scripts/" + pattern)
+        for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
+    )
+
+def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool:
+    if paths is None:
+        return False
+    return any(is_path_workflow_file_related_to_ci(p) for p in paths)

 # Paths matching any of these patterns are considered to have no influence over
 # build or test workflows so any related jobs can be skipped if all paths
@@ -82,12 +100,16 @@ def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
    )
    other_paths = paths_set - github_workflows_paths

+    related_to_ci = check_for_workflow_file_related_to_ci(github_workflows_paths)
    contains_other_non_skippable_files = check_for_non_skippable_path(other_paths)

    print("should_ci_run_given_modified_paths findings:")
    print(f"  contains_other_non_skippable_files: {contains_other_non_skippable_files}")

-    if contains_other_non_skippable_files:
+    if related_to_ci:
+        print("Enabling build jobs since a related workflow file was modified")
+        return True
+    elif contains_other_non_skippable_files:
        print("Enabling TheRock CI jobs since a non-skippable path was modified")
        return True
    else:
--- a/.github/workflows/therock-ci-linux.yml
+++ b/.github/workflows/therock-ci-linux.yml
@@ -27,30 +27,35 @@ jobs:
      TEATIME_FORCE_INTERACTIVE: 0
      AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini
    steps:
+      - name: "Checking out repository for rocm-libraries"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/rocm-libraries"
+
      - name: Checkout composable_kernel repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          path: "composable_kernel"

      - name: Checkout TheRock repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: "ROCm/TheRock"
-          ref: ec1c2ef4f2636bce7733fd8c95e1dbb6692c8a57
+          ref: 409f43ad9d564454bb1b23f8c8aa15d6b9d25200
          path: "TheRock"

      - name: Runner Health Settings
        run: |
-          df -h
-          cmake --version
-          echo "Installed Python versions:"
-          ls -d /opt/python
-          echo "python: $(which python), python3: $(which python3)"
-          echo "Git version: $(git --version)"
-          git config --global --add safe.directory $PWD
-          git config fetch.parallel 10
+          ./TheRock/build_tools/health_status.py
      
      - name: Fetch sources
        run: |
-          ./TheRock/build_tools/fetch_sources.py --jobs 12
+          ./TheRock/build_tools/fetch_sources.py --jobs 12 --no-include-rocm-libraries --no-include-ml-frameworks
+
+      - name: Patch rocm-libraries
+        run: |
+          git config --global --add safe.directory '*'
+          git -c user.name="therockbot" -c "user.email=therockbot@amd.com" am --whitespace=nowarn ./TheRock/patches/amd-mainline/rocm-libraries/*.patch

      - name: Install python deps
        run: |
@@ -92,32 +97,14 @@ jobs:
          aws-region: us-east-2
          role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external

-      - name: Create Logs index Files and upload logs
+      - name: Post Build Upload
        if: always()
        run: |
-          python3 TheRock/build_tools/github_actions/create_log_index.py \
-            --build-dir=TheRock/build \
-            --amdgpu-family=${{ env.AMDGPU_FAMILIES }}
-
-          python3 TheRock/build_tools/github_actions/upload_build_logs_to_s3.py \
-            --build-dir=TheRock/build \
-            --run-id ${{ github.run_id }} \
-            --amdgpu-family ${{ env.AMDGPU_FAMILIES }}
-
-      - name: Upload artifacts
-        run: |
-          python TheRock/build_tools/github_actions/upload_build_artifacts.py \
+          python3 TheRock/build_tools/github_actions/post_build_upload.py \
            --run-id ${{ github.run_id }} \
            --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
-            --build-dir TheRock/build
-
-      - name: Add Links to Job Summary
-        if: always()
-        run: |
-          python TheRock/build_tools/github_actions/upload_build_summary.py \
-            --run-id ${{ github.run_id }} \
-            --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
-            --build-dir TheRock/build
+            --build-dir TheRock/build \
+            --upload

  therock-test-linux:
    name: "Test"
--- a/.github/workflows/therock-ci.yml
+++ b/.github/workflows/therock-ci.yml
@@ -56,7 +56,14 @@ jobs:
    uses: ./.github/workflows/therock-ci-linux.yml
    secrets: inherit
    with:
-      cmake_options: "-DTHEROCK_ENABLE_COMPOSABLE_KERNEL=ON -DTHEROCK_ENABLE_MIOPEN=ON -DTHEROCK_ENABLE_ALL=OFF -DTHEROCK_USE_EXTERNAL_CK=ON -DTHEROCK_CK_SOURCE_DIR=../"
+      cmake_options: >-
+        -DTHEROCK_ENABLE_COMPOSABLE_KERNEL=ON 
+        -DTHEROCK_ENABLE_MIOPEN=ON 
+        -DTHEROCK_ENABLE_ALL=OFF 
+        -DTHEROCK_USE_EXTERNAL_COMPOSABLE_KERNEL=ON 
+        -DTHEROCK_COMPOSABLE_KERNEL_SOURCE_DIR=../composable_kernel
+        -DTHEROCK_USE_EXTERNAL_ROCM_LIBRARIES=ON
+        -DTHEROCK_ROCM_LIBRARIES_SOURCE_DIR=../
      amdgpu_families: "gfx94X-dcgpu"
      test_runs_on: "linux-mi325-1gpu-ossci-rocm"

--- a/.github/workflows/therock-test-component.yml
+++ b/.github/workflows/therock-test-component.yml
@@ -0,0 +1,71 @@
+name: Test component
+
+on:
+  workflow_call:
+    inputs:
+      artifact_run_id:
+        type: string
+        default: ""
+      amdgpu_families:
+        type: string
+      test_runs_on:
+        type: string
+      platform:
+        type: string
+      component:
+        type: string
+
+
+permissions:
+  contents: read
+
+jobs:
+  test_component:
+    name: 'Test ${{ fromJSON(inputs.component).job_name }} (shard ${{ matrix.shard }} of ${{ fromJSON(inputs.component).total_shards }})'
+    runs-on: ${{ inputs.test_runs_on }}
+    container:
+      image: ${{ inputs.platform == 'linux' && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98' || null }}
+      options: --ipc host
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --group-add 992
+        --env-file /etc/podinfo/gha-gpu-isolation-settings
+    strategy:
+      fail-fast: false
+      matrix:
+        # The shard array is based on "total_shards" from "fetch_test_configurations.py"
+        # The test executable will shard based on the array. (ex: [1, 2, 3, 4] = four test shards)
+        shard: ${{ fromJSON(inputs.component).shard_arr }}
+    defaults:
+      run:
+        shell: bash
+    env:
+      VENV_DIR: ${{ github.workspace }}/.venv
+      ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
+      OUTPUT_ARTIFACTS_DIR: "./build"
+      THEROCK_BIN_DIR: "./build/bin"
+      AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          repository: "ROCm/TheRock"
+
+      - name: Run setup test environment workflow
+        uses: './.github/actions/setup_test_environment'
+        with:
+          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
+          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
+          VENV_DIR: ${{ env.VENV_DIR }}
+          FETCH_ARTIFACT_ARGS: ${{ fromJSON(inputs.component).fetch_artifact_args }}
+          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
+
+      - name: Test
+        timeout-minutes: ${{ fromJSON(inputs.component).timeout_minutes }}
+        env:
+          SHARD_INDEX: ${{ matrix.shard }}
+          TOTAL_SHARDS: ${{ fromJSON(inputs.component).total_shards }}
+        run: |
+          ${{ fromJSON(inputs.component).test_script }}
--- a/.github/workflows/therock-test-packages.yml
+++ b/.github/workflows/therock-test-packages.yml
@@ -37,41 +37,17 @@ jobs:

  test_components:
    name: 'Test ${{ matrix.components.job_name }}'
-    runs-on: ${{ inputs.test_runs_on }}
-    needs: configure_test_matrix
+    needs: [configure_test_matrix]
    # skip tests if no test matrix to run
    if: ${{ needs.configure_test_matrix.outputs.components != '[]' }}
    strategy:
      fail-fast: false
      matrix:
        components: ${{ fromJSON(needs.configure_test_matrix.outputs.components) }}
-    defaults:
-      run:
-        shell: bash
-    env:
-      VENV_DIR: ${{ github.workspace }}/.venv
-      ARTIFACT_RUN_ID: "${{ github.run_id }}"
-      OUTPUT_ARTIFACTS_DIR: ${{ github.workspace }}/build
-      THEROCK_BIN_DIR: "./build/bin"
-    steps:
-      - name: Checkout Repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          repository: "ROCm/TheRock"
-
-      - name: Run setup test environment workflow
-        uses: './.github/actions/setup_test_environment'
-        with:
-          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
-          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
-          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
-          VENV_DIR: ${{ env.VENV_DIR }}
-          FETCH_ARTIFACT_ARGS: ${{ matrix.components.fetch_artifact_args }}
-          PLATFORM: ${{ inputs.platform }}
-          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
-
-      - name: Test
-        timeout-minutes: ${{ matrix.components.timeout_minutes }}
-        run: |
-          if [ "${{ inputs.PLATFORM }}" == "linux" ]; then source ${VENV_DIR}/bin/activate ; else . ${VENV_DIR}/Scripts/activate ; fi
-          ${{ matrix.components.test_script }}
+    uses: './.github/workflows/therock-test-component.yml'
+    with:
+      artifact_run_id: ${{  github.run_id }}
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      test_runs_on: ${{ inputs.test_runs_on }}
+      platform: ${{ inputs.platform }}
+      component: ${{ toJSON(matrix.components) }}
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,12 +6,12 @@ repos:
        entry: clang-format-18 -i --style=file
        language: system
        types_or: [c++, inc]
-    -   id: copyright-year-checker
-        name: copyright-year-checker
-        entry: script/check_copyright_year.sh
-        verbose: false
-        language: script
-        types: [c++]
+    # -   id: copyright-year-checker
+    #     name: copyright-year-checker
+    #     entry: script/check_copyright_year.sh
+    #     verbose: false
+    #     language: script
+    #     types: [c++]
    -   id: remove-exec-bit
        name: Remove executable bit from non-executable files
        entry: script/remove_exec_bit.sh
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,9 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 ## Composable Kernel 1.2.0 for ROCm 7.0.0

 ### Added
-
+* Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM.
+* Added the new api to load different memory sizes to SGPR.
+* Added support for B Tensor Preshuffle in CK TILE Grouped GEMM.
 * Added a basic copy kernel example and supporting documentation for new CK Tile developers.
 * Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data
 * Added a fully asynchronous HOST (CPU) arguments copy flow for CK grouped GEMM kernels.
@@ -15,6 +17,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 * Added support for GKCYX layout for grouped convolution backward data (NGCHW/GKCYX/NGKHW).
 * Added support for Stream-K version of mixed fp8/bf16 GEMM
 * Added support for Multiple D GEMM
+* Added support for Multiple ABD GEMM
 * Added GEMM pipeline for microscaling (MX) FP8/FP6/FP4 data types
 * Added support for FP16 2:4 structured sparsity to universal GEMM.
 * Added support for Split K for grouped convolution backward data.
@@ -28,7 +31,10 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 * Added support for elementwise kernel.
 * Added benchmarking support for tile engine GEMM Multi D.
 * Added block scaling support in CK_TILE GEMM, allowing flexible use of quantization matrices from either A or B operands.
-* Added support for skipping LDS to universal GEMM
+* Added the row-wise column-wise quantization for CK_TILE GEMM & CK_TILE Grouped GEMM.
+* Added support for f32 to FMHA (fwd/bwd).
+* Added tensor-wise quantization for CK_TILE GEMM.
+* Added support for skipping LDS to universal GEMM (no A column-wise, B row-wise support)

 ### Optimized

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -220,7 +220,10 @@ rocm_check_target_ids(SUPPORTED_GPU_TARGETS

 message(STATUS "Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")

-if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
+# Cache SUPPORTED_GPU_TARGETS for debug
+set(SUPPORTED_GPU_TARGETS "${SUPPORTED_GPU_TARGETS}" CACHE STRING "List of supported GPU targets")
+
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
    message(STATUS "Enabling XDL instances")
    add_definitions(-DCK_USE_XDL)
    set(CK_USE_XDL "ON")
@@ -234,6 +237,10 @@ endif()
 # new macro CK_TILE_USE_WMMA in order to separately compile examples for MFMA/WMMA
 set(CK_TILE_USE_WMMA 0)

+if (SUPPORTED_GPU_TARGETS MATCHES "gfx10")
+    add_definitions(-DCK_GFX1030_SUPPORT)
+endif()
+
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
    message(STATUS "Enabling WMMA instances")
    add_definitions(-DCK_USE_WMMA)
@@ -335,6 +342,7 @@ endif()
 option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
 option(USE_OPT_GFX11 "Whether to enable LDS cumode and Wavefront32 mode for GFX11 silicons." OFF)
 option(ENABLE_ASM_DUMP "Whether to enable assembly dump for kernels." OFF)
+option(ENABLE_JSON_DUMP "Whether to enable json dump for examples." OFF)

 if(USE_BITINT_EXTENSION_INT4)
    add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
@@ -348,6 +356,11 @@ if(ENABLE_ASM_DUMP)
    message("CK compiled with ENABLE_ASM_DUMP set to ${ENABLE_ASM_DUMP}")
 endif()

+if (ENABLE_JSON_DUMP)
+    add_compile_definitions(CK_ENABLE_JSON_DUMP)
+    message("CK compiled with ENABLE_JSON_DUMP set to ${ENABLE_JSON_DUMP}")
+endif()
+
 ## Threads
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -22,6 +22,9 @@ Xiaoyan Zhou, 2020
 [Jianfeng Yan](https://github.com/j4yan), 2021-2022
 [Jun Liu](https://github.com/junliume), 2021-2024

+[John Shumway](https://github.com/shumway), [Vidyasagar Ananthan](https://github.com/vidyasagar-amd), [Christopher Millette](https://github.com/cgmillette), [Maksim Podkorytov](https://github.com/tenpercent), [Thomas Ning](https://github.com/ThomasNing),[Andriy Roshchenko](https://github.com/andriy-ca), [Aviral Goel](https://github.com/AviralGoelAMD), [Cong Ma](https://github.com/CongMa13),[Thrupti Raj Lakshmana Gowda](https://github.com/ThruptiRajLakshmanaGowda), [Emily Martins](https://github.com/ecamartins), [Khushbu Agarwal](https://github.com/amd-khushbu), [Sudhir Kylasa](https://github.com/kylasa), [Jia Luo](https://github.com/JiaLuo-CAN),  2025-
+
+
 ## Product Manager
 [John Afaganis](https://github.com/afagaj)

--- a/29
+++ b/29
@@ -1,27 +1,23 @@
+
 FROM ubuntu:24.04
 ARG DEBIAN_FRONTEND=noninteractive
-ARG ROCMVERSION=6.4.1
+ARG ROCMVERSION=7.0.1
 ARG compiler_version=""
 ARG compiler_commit=""
 ARG CK_SCCACHE=""
 ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
 ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
+ENV DEBIAN_FRONTEND=noninteractive

 # Add rocm repository
 RUN set -xe && \
-    apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
-    curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
+    apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl

-RUN if [ "$ROCMVERSION" != "6.5" ]; then \
-        sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/jammy/amdgpu-install_6.4.60401-1_all.deb  --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.4.60401-1_all.deb && \
-        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
-        sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO jammy main > /etc/apt/sources.list.d/rocm.list" && \
-        sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list'; \
-    fi
-
-RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu jammy main universe | tee -a /etc/apt/sources.list" && \
-    amdgpu-install -y --usecase=rocm --no-dkms
+RUN wget https://repo.radeon.com/amdgpu-install/7.0.1/ubuntu/noble/amdgpu-install_7.0.1.70001-1_all.deb && \
+    apt install ./amdgpu-install_7.0.1.70001-1_all.deb -y && \
+    apt update && \
+    apt install python3-setuptools python3-wheel -y && \
+    apt install rocm-dev -y

 ## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
 ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
@@ -45,7 +41,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    libelf-dev \
    libnuma-dev \
    libpthread-stubs0-dev \
-    llvm-amdgpu \
    mpich \
    net-tools \
    pkg-config \
@@ -61,17 +56,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    zip \
    libzstd-dev \
    openssh-server \
-    clang-format-12 \
    clang-format-18 \
    kmod && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf amdgpu-install* && \
-# Remove unnecessary rocm components that take a lot of space
-    apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt
-
 #Install latest ccache
-RUN git clone https://github.com/ccache/ccache.git && \
+    git clone https://github.com/ccache/ccache.git && \
    cd ccache && mkdir build && cd build && cmake .. && make install && \
 #Install ninja build tracing tools
    cd / && \
--- a/Dockerfile.aiter
+++ b/Dockerfile.aiter
@@ -1,10 +1,8 @@
-ARG BASE_DOCKER="rocm/pytorch:latest"
+ARG BASE_DOCKER="rocm/composable_kernel-private:ck_aiter_base"
 FROM $BASE_DOCKER
 ARG AITER_BRANCH="main"
 ARG CK_AITER_BRANCH="develop"
-RUN groupadd -g 109 render && \
-    usermod -u 1001 jenkins && \
-    groupmod -g 1001 jenkins && \
+RUN groupadd irc && \
    pip install pandas zmq einops && \
    pip install numpy==1.26.2 && \
    sudo mkdir /home/jenkins && \
--- a/Dockerfile.compiler
+++ b/Dockerfile.compiler
@@ -1,4 +1,4 @@
-ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm6.4.1"
+ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm7.0.1"
 FROM $BASE_DOCKER
 ARG compiler_version=""
 ARG compiler_commit=""
--- a/244
+++ b/244
@@ -33,9 +33,6 @@ def nthreads() {
    def nproc = sh(returnStdout: true, script: 'nproc')
    echo "Number of cores: ${nproc}"
    def n = nproc.toInteger()
-    if (n > 32){
-        n /= 2
-    }
    if (n > 64){
        n = 64
    }
@@ -56,7 +53,7 @@ def getBaseDockerImageName(){
    }
    else{
        def ROCM_numeric = parseVersion("${params.ROCMVERSION}")
-        if ( ROCM_numeric.major <= 6 && ROCM_numeric.minor < 5 ){
+        if ( ROCM_numeric.major <= 7 && ROCM_numeric.minor < 1 ){
            img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}"
            }
        else{
@@ -152,7 +149,7 @@ def getDockerImage(Map conf=[:]){
        image = conf.get("docker_name", "")
        echo "Using legacy docker: ${image}"
    }
-    else if ( params.BUILD_GFX950 && conf.get("docker_name", "") != "" ){
+    else if ( (params.BUILD_GFX950 || params.RUN_CK_TILE_FMHA_TESTS) && conf.get("docker_name", "") != "" ){
        image = conf.get("docker_name", "")
        echo "Using special docker: ${image}"
    }
@@ -160,9 +157,9 @@ def getDockerImage(Map conf=[:]){
        image = getDockerImageName()
        echo "Using default docker: ${image}"
    }
-    //Check if image exists 
+    //Check if image exists
    def retimage
-    try 
+    try
    {
        echo "Pulling image: ${image}"
        retimage = docker.image("${image}")
@@ -189,11 +186,11 @@ def buildDocker(install_prefix){
        dockerArgs = dockerArgs + " --no-cache --build-arg BASE_DOCKER='${base_image_name}' -f Dockerfile.compiler . "
    }
    else if(params.RUN_AITER_TESTS){
-        image_name = "rocm/composable_kernel:ck_aiter"
+        image_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_aiter"
        dockerArgs = dockerArgs + " --no-cache -f Dockerfile.aiter --build-arg AITER_BRANCH='${params.aiter_branch}' --build-arg CK_AITER_BRANCH='${params.ck_aiter_branch}' . "
    }
     else if(params.RUN_PYTORCH_TESTS){
-        image_name = "rocm/composable_kernel:ck_pytorch"
+        image_name = "${env.CK_DOCKERHUB}:ck_pytorch"
        dockerArgs = dockerArgs + " --no-cache -f Dockerfile.pytorch --build-arg CK_PYTORCH_BRANCH='${params.ck_pytorch_branch}' . "
    }
   else{
@@ -235,7 +232,7 @@ def cmake_build(Map conf=[:]){
    def setup_args = conf.get("setup_args","")
    // make sure all unit tests always run on develop branch
    def runAllUnitTests = (env.BRANCH_NAME == "develop") ? true : params.RUN_ALL_UNIT_TESTS
-    
+
    if (prefixpath != "/usr/local"){
        setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} "
    }
@@ -324,7 +321,7 @@ def cmake_build(Map conf=[:]){
                    ${redis_pre_setup_cmd}
                """)
            sh cmd1
-            setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache " + setup_args
+            setup_args = " -DCMAKE_HIP_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache " + setup_args
        }
        catch(Exception err){
            echo "could not connect to redis server: ${err.getMessage()}. will not use sccache."
@@ -360,7 +357,7 @@ def cmake_build(Map conf=[:]){
            "build_cmd",
            "${build_envs} ninja -j${nt} ${config_targets}"
        )
-        
+
        cmd = conf.get("cmd", """
            ${setup_cmd}
            ${build_cmd}
@@ -452,7 +449,7 @@ def buildHipClangJob(Map conf=[:]){
        checkout scm
        def prefixpath = conf.get("prefixpath", "/opt/rocm")

-        // Jenkins is complaining about the render group 
+        // Jenkins is complaining about the render group
        def dockerOpts
        if ( params.BUILD_INSTANCES_ONLY ){
            dockerOpts = "--group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
@@ -479,7 +476,7 @@ def buildHipClangJob(Map conf=[:]){
        def retimage
        (retimage, image) = getDockerImage(conf)

-        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
            withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
                timeout(time: 20, unit: 'HOURS')
                {
@@ -518,7 +515,7 @@ def Build_CK(Map conf=[:]){
        checkout scm
        def prefixpath = conf.get("prefixpath", "/opt/rocm")

-        // Jenkins is complaining about the render group 
+        // Jenkins is complaining about the render group
        def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
        if (conf.get("enforce_xnack_on", false)) {
            dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
@@ -541,7 +538,7 @@ def Build_CK(Map conf=[:]){
        def image
        def retimage

-        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
            try {
                (retimage, image) = getDockerImage(conf)
                withDockerContainer(image: image, args: dockerOpts) {
@@ -719,10 +716,10 @@ def process_results(Map conf=[:]){
    env.HSA_ENABLE_SDMA=0
    checkout scm
    //use older image that has user jenkins
-    def image = "rocm/composable_kernel:ck_ub22.04_rocm6.3"
+    def image = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm6.3"
    def prefixpath = "/opt/rocm"

-    // Jenkins is complaining about the render group 
+    // Jenkins is complaining about the render group
    def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
    if (conf.get("enforce_xnack_on", false)) {
        dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
@@ -731,7 +728,7 @@ def process_results(Map conf=[:]){
    def variant = env.STAGE_NAME
    def retimage

-    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
        try
        {
            echo "Pulling image: ${image}"
@@ -830,7 +827,7 @@ def run_aiter_tests(Map conf=[:]){
    env.HSA_ENABLE_SDMA=0
    checkout scm
    //use the latest pytorch image
-    def image = "rocm/composable_kernel:ck_aiter"
+    def image = "${env.CK_DOCKERHUB_PRIVATE}:ck_aiter"
    def dockerOpts="--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --user=jenkins -v=/var/jenkins/:/var/jenkins"
    def variant = env.STAGE_NAME
    def retimage
@@ -839,7 +836,7 @@ def run_aiter_tests(Map conf=[:]){
    dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
    echo "Docker flags: ${dockerOpts}"

-    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
        try
        {
            echo "Pulling image: ${image}"
@@ -855,13 +852,21 @@ def run_aiter_tests(Map conf=[:]){
    }

    withDockerContainer(image: image, args: dockerOpts) {
-        timeout(time: 45, unit: 'MINUTES'){
+        timeout(time: 5, unit: 'HOURS'){
            try{
                sh "rocminfo"
                sh "python3 --version"
                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py"
                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py"
                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha_varlen.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_2stage.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_blockscale.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_ep.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_sorting.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_sorting_mxfp4.py"
+                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_tkw1.py"
            }
            catch(e){
                echo "Throwing error exception while running AITER tests"
@@ -881,7 +886,7 @@ def run_pytorch_tests(Map conf=[:]){
    env.HSA_ENABLE_SDMA=0
    checkout scm
    //use the latest pytorch-nightly image
-    def image = "rocm/composable_kernel:ck_pytorch"
+    def image = "${env.CK_DOCKERHUB}:ck_pytorch"
    def dockerOpts="--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --user=jenkins -v=/var/jenkins/:/var/jenkins"
    def variant = env.STAGE_NAME
    def retimage
@@ -890,7 +895,7 @@ def run_pytorch_tests(Map conf=[:]){
    dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
    echo "Docker flags: ${dockerOpts}"

-    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
        try
        {
            echo "Pulling image: ${image}"
@@ -906,7 +911,7 @@ def run_pytorch_tests(Map conf=[:]){
    }

    withDockerContainer(image: image, args: dockerOpts) {
-        timeout(time: 45, unit: 'MINUTES'){
+        timeout(time: 2, unit: 'HOURS'){
            try{
                sh "rocminfo"
                sh "python3 --version"
@@ -926,7 +931,8 @@ def run_pytorch_tests(Map conf=[:]){
 }

 //launch develop branch daily jobs
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_PERFORMANCE_TESTS=true
+                                              0 22 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
                                              0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
                                              0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
                                              0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
@@ -952,20 +958,20 @@ pipeline {
            defaultValue: '',
            description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
        string(
-            name: 'ROCMVERSION', 
-            defaultValue: '6.4.1',
-            description: 'Specify which ROCM version to use: 6.4.1 (default).')
+            name: 'ROCMVERSION',
+            defaultValue: '7.0.1',
+            description: 'Specify which ROCM version to use: 7.0.1 (default).')
        string(
-            name: 'COMPILER_VERSION', 
-            defaultValue: '', 
+            name: 'COMPILER_VERSION',
+            defaultValue: '',
            description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline, or leave blank (default).')
        string(
-            name: 'COMPILER_COMMIT', 
-            defaultValue: '', 
+            name: 'COMPILER_COMMIT',
+            defaultValue: '',
            description: 'Specify which commit of compiler branch to use: leave blank to use the latest commit (default), or use some specific commit of llvm-project branch.')
        string(
-            name: 'BUILD_COMPILER', 
-            defaultValue: '/opt/rocm/llvm/bin/clang++', 
+            name: 'BUILD_COMPILER',
+            defaultValue: '/opt/rocm/llvm/bin/clang++',
            description: 'Build CK with /opt/rocm/bin/hipcc, /llvm-project/build/bin/clang++, or with /opt/rocm/llvm/bin/clang++ (default).')
        booleanParam(
            name: "RUN_FULL_QA",
@@ -1029,12 +1035,12 @@ pipeline {
            description: "Build CK and run tests on gfx90a (default: ON)")
        booleanParam(
            name: "BUILD_GFX942",
-            defaultValue: false,
-            description: "Build CK and run tests on gfx942 (default: OFF)")
+            defaultValue: true,
+            description: "Build CK and run tests on gfx942 (default: ON)")
        booleanParam(
            name: "BUILD_GFX950",
-            defaultValue: false,
-            description: "Build CK and run tests on gfx950 (default: OFF)")
+            defaultValue: true,
+            description: "Build CK and run tests on gfx950 (default: ON)")
        booleanParam(
            name: "BUILD_GFX10",
            defaultValue: true,
@@ -1121,15 +1127,16 @@ pipeline {
                    agent{ label rocmnode("nogpu") }
                    environment{
                        setup_args = "NO_CK_BUILD"
-                        execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \
-                                -o -not -path \'*.git*\' -iname \'*.hpp\' \
-                                -o -not -path \'*.git*\' -iname \'*.cpp\' \
-                                -o -iname \'*.h.in\' \
-                                -o -iname \'*.hpp.in\' \
-                                -o -iname \'*.cpp.in\' \
-                                -o -iname \'*.cl\' \
+                        execute_cmd = "(cd .. && git ls-files \'*.h\' \
+                                \'*.hpp\' \
+                                \'*.cpp\' \
+                                \'*.h.in\' \
+                                \'*.hpp.in\' \
+                                \'*.cpp.in\' \
+                                \'*.cl\' \
                                | grep -v 'build/' \
-                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\' && \
+                                | grep -v 'include/rapidjson' \
+                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\') && \
                                /cppcheck/build/bin/cppcheck ../* -v -j \$(nproc) -I ../include -I ../profiler/include -I ../library/include \
                                -D CK_ENABLE_FP64 -D CK_ENABLE_FP32 -D CK_ENABLE_FP16 -D CK_ENABLE_FP8 -D CK_ENABLE_BF16 -D CK_ENABLE_BF8 -D CK_ENABLE_INT8 \
                                -D __gfx908__ -D __gfx90a__ -D __gfx942__ -D __gfx1030__ -D __gfx1100__ -D __gfx1101__ -D __gfx1102__ \
@@ -1150,15 +1157,17 @@ pipeline {
                    agent{ label rocmnode("nogpu") }
                    environment{
                        setup_args = "NO_CK_BUILD"
-                        execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \
-                                -o -not -path \'*.git*\' -iname \'*.hpp\' \
-                                -o -not -path \'*.git*\' -iname \'*.cpp\' \
-                                -o -iname \'*.h.in\' \
-                                -o -iname \'*.hpp.in\' \
-                                -o -iname \'*.cpp.in\' \
-                                -o -iname \'*.cl\' \
+                        execute_cmd = "(cd .. && git ls-files \
+                                \'*.h\' \
+                                \'*.hpp\' \
+                                \'*.cpp\' \
+                                \'*.h.in\' \
+                                \'*.hpp.in\' \
+                                \'*.cpp.in\' \
+                                \'*.cl\' \
                                | grep -v 'build/' \
-                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\'"
+                                | grep -v 'include/rapidjson' \
+                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\')"
                    }
                    steps{
                        buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
@@ -1201,6 +1210,18 @@ pipeline {
                        cleanWs()
                    }
                }
+                stage("Run AITER Tests on gfx950")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_AITER_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx950")}
+                    steps{
+                        run_aiter_tests()
+                        cleanWs()
+                    }
+                }
            }
        }
        stage("Run Grouped Conv Large Case Tests")
@@ -1272,7 +1293,7 @@ pipeline {
                    agent{ label rocmnode("gfx90a")}
                    environment{
                        setup_args = "NO_CK_BUILD"
-                        execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \
+                        execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake -DCMAKE_PREFIX_PATH=/opt/rocm ../codegen && \
                                           make -j64 check"""
                    }
                    steps{
@@ -1315,7 +1336,7 @@ pipeline {
                    environment{
                        setup_args = "NO_CK_BUILD"
                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx942 && \
-                                           make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
+                                           make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \
                                           cd ../ &&
                                           example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
                    }
@@ -1324,6 +1345,25 @@ pipeline {
                        cleanWs()
                    }
                }
+                stage("Run CK_TILE_FMHA Tests on gfx950")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx950") }
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx950 && \
+                                           make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \
+                                           cd ../ &&
+                                           example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx950 """
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
            }
        }
        stage("Run TILE_ENGINE_GEMM Tests")
@@ -1347,23 +1387,15 @@ pipeline {
                                            -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
                                            -D GEMM_MULTI_D_DATATYPE="fp16" \
                                            -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
+                                            -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8" \
+                                            -D GEMM_PRESHUFFLE_LAYOUT="rcr" \
                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && \
-                                           ninja -j64 benchmark_gemm_fp8_rcr && \
-                                           ./bin/benchmark_gemm_fp8_rcr && \
-                                           ninja -j64 benchmark_gemm_fp16_rcr && \
-                                           ./bin/benchmark_gemm_fp16_rcr && \
-                                           ninja -j64 benchmark_gemm_fp8_crr && \
-                                           ./bin/benchmark_gemm_fp8_crr && \
-                                           ninja -j64 benchmark_gemm_fp16_crr && \
-                                           ./bin/benchmark_gemm_fp16_crr && \
-                                           ninja -j64 benchmark_gemm_fp8_ccr && \
-                                           ./bin/benchmark_gemm_fp8_ccr && \
-                                           ninja -j64 benchmark_gemm_fp16_ccr && \
-                                           ./bin/benchmark_gemm_fp16_ccr && \
-                                           ninja -j64 benchmark_gemm_fp8_rrr && \
-                                           ./bin/benchmark_gemm_fp8_rrr && \
-                                           ninja -j64 benchmark_gemm_fp16_rrr && \
-                                           ./bin/benchmark_gemm_fp16_rrr && \
+                                           ninja -j64 benchmark_gemm_all && \
+                                           python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \
+                                           --warmup 5 --repeat 5 --verbose --json results.json && \
+                                           ninja -j64 benchmark_gemm_preshuffle_all && \
+                                           python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" \
+                                           --warmup 5 --repeat 5 --verbose --json results.json && \
                                           ninja -j64 benchmark_gemm_multi_d_fp16_rrrr && \
                                           ./bin/benchmark_gemm_multi_d_fp16_rrrr && \
                                           ninja -j64 benchmark_gemm_multi_d_fp16_ccrr && \
@@ -1395,23 +1427,15 @@ pipeline {
                                            -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
                                            -D GEMM_MULTI_D_DATATYPE="fp16" \
                                            -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
+                                            -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8" \
+                                            -D GEMM_PRESHUFFLE_LAYOUT="rcr" \
                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && \
-                                           ninja -j64 benchmark_gemm_fp8_rcr && \
-                                           ./bin/benchmark_gemm_fp8_rcr && \
-                                           ninja -j64 benchmark_gemm_fp16_rcr && \
-                                           ./bin/benchmark_gemm_fp16_rcr && \
-                                           ninja -j64 benchmark_gemm_fp8_crr && \
-                                           ./bin/benchmark_gemm_fp8_crr && \
-                                           ninja -j64 benchmark_gemm_fp16_crr && \
-                                           ./bin/benchmark_gemm_fp16_crr && \
-                                           ninja -j64 benchmark_gemm_fp8_ccr && \
-                                           ./bin/benchmark_gemm_fp8_ccr && \
-                                           ninja -j64 benchmark_gemm_fp16_ccr && \
-                                           ./bin/benchmark_gemm_fp16_ccr && \
-                                           ninja -j64 benchmark_gemm_fp8_rrr && \
-                                           ./bin/benchmark_gemm_fp8_rrr && \
-                                           ninja -j64 benchmark_gemm_fp16_rrr && \
-                                           ./bin/benchmark_gemm_fp16_rrr && \
+                                           ninja -j64 benchmark_gemm_all && \
+                                           python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \
+                                           --warmup 5 --repeat 5 --verbose --json results.json && \
+                                           ninja -j64 benchmark_gemm_preshuffle_all && \
+                                           python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" \
+                                           --warmup 5 --repeat 5 --verbose --json results.json && \
                                           ninja -j64 benchmark_gemm_multi_d_fp16_rrrr && \
                                           ./bin/benchmark_gemm_multi_d_fp16_rrrr && \
                                           ninja -j64 benchmark_gemm_multi_d_fp16_ccrr && \
@@ -1426,6 +1450,36 @@ pipeline {
                        cleanWs()
                    }
                }
+                stage("Run TILE_ENGINE_GEMM Tests on gfx1201")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx1201") }
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
+                                            -D CMAKE_CXX_COMPILER="${build_compiler()}" \
+                                            -D CMAKE_BUILD_TYPE=Release \
+                                            -D GPU_TARGETS="gfx1201" \
+                                            -D GEMM_DATATYPE="fp16" \
+                                            -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
+                                            -DGEMM_CONFIG_FILE=gfx120x_config.json \
+                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && \
+                                           ninja -j64 benchmark_gemm_all && \
+                                           python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \
+                                           --warmup 5 --repeat 5 --verbose --json results.json && \
+                                           ninja -j64 benchmark_gemm_fp16_rcr && \
+                                           ninja -j64 benchmark_gemm_fp16_rrr && \
+                                           ninja -j64 benchmark_gemm_fp16_crr && \
+                                           ninja -j64 benchmark_gemm_fp16_ccr """
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
            }
        }

@@ -1514,7 +1568,7 @@ pipeline {
                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
                    }
                    steps{
-                        Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                        cleanWs()
                    }
                }
@@ -1569,7 +1623,7 @@ pipeline {
                    agent{ label rocmnode("gfx942") }
                    steps{
                        script {
-                            def execute_args = params.NINJA_FTIME_TRACE ? 
+                            def execute_args = params.NINJA_FTIME_TRACE ?
                                """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
                                    -D CMAKE_CXX_COMPILER="${build_compiler()}" \
                                    -D CMAKE_BUILD_TYPE=Release \
@@ -1578,8 +1632,8 @@ pipeline {
                                    -D CMAKE_CXX_COMPILER="${build_compiler()}" \
                                    -D CMAKE_BUILD_TYPE=Release \
                                    -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """
-                            
-                            buildHipClangJobAndReboot(setup_cmd: "",  build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0")
+
+                            buildHipClangJobAndReboot(setup_cmd: "",  build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1")
                        }
                        cleanWs()
                    }
@@ -1605,13 +1659,13 @@ pipeline {
                        cleanWs()
                    }
                }
-                stage("Build CK and run Tests on gfx1101")
+                stage("Build CK and run Tests on gfx11")
                {
                    when {
                        beforeAgent true
                        expression { params.BUILD_GFX11.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
                    }
-                    agent{ label rocmnode("gfx1101") }
+                    agent{ label 'miopen && (gfx1101 || gfx1100)' }
                    environment{
                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx11-generic" -DUSE_OPT_GFX11=ON -DCMAKE_CXX_FLAGS=" -O3 " """
                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
--- a/README.md
+++ b/README.md
@@ -184,7 +184,7 @@ hours to 1-2 minutes. In order to invoke sccache, you need to run:
 then add the following flags to the cmake command line:

 ```bash
- -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache
+ -DCMAKE_HIP_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache
 ```

 You may need to clean up the build folder and repeat the cmake and make steps in order to take
--- a/client_example/CMakeLists.txt
+++ b/client_example/CMakeLists.txt
@@ -48,7 +48,7 @@ else()
 endif()

 if (GPU_TARGETS)
-    if (GPU_TARGETS MATCHES "gfx9")
+    if (GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
        add_definitions(-DCK_USE_XDL)
        set(CK_USE_XDL "ON")
    endif()
--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -12,6 +12,7 @@ configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h)
 find_package(ROCM)
 include(ROCMInstallTargets)
 include(ROCMTest)
+find_package(hiprtc REQUIRED)

 rocm_setup_version(VERSION 1.0)

@@ -27,7 +28,7 @@ add_compile_options(-std=c++20)
 file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
 # TODO: Use object library
 add_library(ck_host STATIC ${SOURCES})
-target_link_libraries(ck_host PRIVATE ck_headers)
+target_link_libraries(ck_host PRIVATE ck_headers hiprtc::hiprtc)

 set_target_properties(ck_host PROPERTIES 
    LINKER_LANGUAGE CXX
--- a/codegen/src/device_batched_gemm_softmax_gemm_operation_xdl_cshuffle.cpp
+++ b/codegen/src/device_batched_gemm_softmax_gemm_operation_xdl_cshuffle.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/host/device_batched_gemm_softmax_gemm/operation.hpp"
 #include "ck/host/stringutils.hpp"
@@ -76,28 +76,28 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
 //   Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl| Prefetch|
 //       |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per|    Stage|
 //       |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|         |
-  {   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,        1},
-  {   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,        1},
-  {   256,    128,   256,    32,    64,    32,   8,   8,    2,   32,   32,     1,     8,     2,        1},
-  {   256,    128,   256,    32,   128,    32,   8,   8,    2,   32,   32,     1,     8,     4,        1},
-  {   256,    128,   128,    64,    64,    32,   8,   8,    2,   32,   32,     1,     4,     2,        1},
-  {   256,    128,   128,    32,    64,    32,   8,   8,    2,   32,   32,     1,     4,     2,        1},
-  {   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,        1},
-  {   256,    128,   128,    32,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,        1},
-  {   256,     64,   256,    32,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,        1},
-  {   256,     64,   256,    32,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,        1},
-  {   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,        1},
-  {   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,        1},
+  {   256,    256,   128,    32,    64,    32,   8,   8,    2,   16,   16,     4,     8,     4,        1},
+  {   256,    256,   128,    32,   128,    32,   8,   8,    2,   16,   16,     4,     8,     8,        1},
+  {   256,    128,   256,    32,    64,    32,   8,   8,    2,   16,   16,     2,    16,     4,        1},
+  {   256,    128,   256,    32,   128,    32,   8,   8,    2,   16,   16,     2,    16,     8,        1},
+  {   256,    128,   128,    64,    64,    32,   8,   8,    2,   16,   16,     2,     8,     4,        1},
+  {   256,    128,   128,    32,    64,    32,   8,   8,    2,   16,   16,     2,     8,     4,        1},
+  {   256,    128,   128,    64,   128,    32,   8,   8,    2,   16,   16,     2,     8,     8,        1},
+  {   256,    128,   128,    32,   128,    32,   8,   8,    2,   16,   16,     2,     8,     8,        1},
+  {   256,    128,   256,    32,   128,    32,   8,   8,    2,   16,   16,     2,    16,     8,        1},
+  {   256,    128,   256,    32,    64,    32,   8,   8,    2,   16,   16,     2,    16,     4,        1},
+  {   256,    128,   256,    64,   128,    32,   8,   8,    2,   16,   16,     2,    16,     8,        1},
+  {   256,    128,   256,    64,    64,    32,   8,   8,    2,   16,   16,     2,    16,     4,        1},
 // Padded fallback kernel  
-  {   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,        1},
-  {   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,        1},
+  {   256,    128,   128,    64,   128,    32,   8,   8,    2,   16,   16,     2,     8,     8,        1},
+  {   256,    128,    64,    32,   128,    32,   8,   8,    2,   16,   16,     2,     4,     8,        1},
 // Irregular k
-  {   256,    256,   128,    40,    64,    32,   4,   4,    2,   32,   32,     2,     4,     2,        1},
-  {   256,    256,   128,    40,   128,    32,   4,   4,    2,   32,   32,     2,     4,     4,        1},
-  {   256,    128,   256,    40,    64,    32,   4,   4,    2,   32,   32,     1,     8,     2,        1},
-  {   256,    128,   256,    40,   128,    32,   4,   4,    2,   32,   32,     1,     8,     4,        1},
-  {   256,    128,   128,    40,    64,    32,   4,   4,    2,   32,   32,     1,     4,     2,        1},
-  {   256,    128,   128,    40,   128,    32,   4,   4,    2,   32,   32,     1,     4,     4,        1},
+  {   256,    256,   128,    48,    64,    32,   4,   4,    2,   16,   16,     4,     8,     4,        1},
+  {   256,    256,   128,    48,   128,    32,   4,   4,    2,   16,   16,     4,     8,     8,        1},
+  {   256,    128,   256,    48,    64,    32,   4,   4,    2,   16,   16,     2,    16,     4,        1},
+  {   256,    128,   256,    48,   128,    32,   4,   4,    2,   16,   16,     2,    16,     8,        1},
+  {   256,    128,   128,    48,    64,    32,   4,   4,    2,   16,   16,     2,     8,     4,        1},
+  {   256,    128,   128,    48,   128,    32,   4,   4,    2,   16,   16,     2,     8,     8,        1},
        // clang-format on
    };

@@ -200,28 +200,28 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
 //         _MBlock_MWaveMPerXdl| ScalarPerVector
 //         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl
 //                             |                
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 16, 1,16>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 16, 1,16>,               8},
-  {              S<1, 32, 1, 8>,               8},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 16, 1,16>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 16, 1,16>,               4},
+  {              S<1, 32, 1, 8>,               4},
 // Padded fallback kernel
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
 // Irregular k
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
        // clang-format on
    };

--- a/codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp
+++ b/codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/host/device_gemm_multiple_d/operation.hpp"
 #include "ck/host/stringutils.hpp"
@@ -81,16 +81,16 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
 //   Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per| Prefetch|
 //       |      |      |      |    |    |     |     | Wave| Wave|    Stage|
 //       |      |      |      |    |    |     |     |     |     |         |
-  {   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,        1},
-  {   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,        1},
-  {   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,        1},
-  {   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,        1},
-  {   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,        1},
-  {   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,        1},
-  {   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,        1},
-  {   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,        1},
+  {   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,        1},
+  {   256,   128,   256,    32,   8,   8,   16,   16,    4,    8,        1},
+  {   128,   128,   128,    32,   8,   8,   16,   16,    8,    4,        1},
+  {   256,   128,   128,    32,   8,   8,   16,   16,    4,    4,        1},
+  {   128,   128,    64,    32,   8,   8,   16,   16,    4,    4,        1},
+  {   128,    64,   128,    32,   8,   8,   16,   16,    4,    4,        1},
+  {   256,   128,    64,    32,   8,   8,   16,   16,    4,    2,        1},
+  {   256,    64,   128,    32,   8,   8,   16,   16,    2,    4,        1},
 //  Irregular tile
-  {    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,        1},
+  {    64,    32,    32,    32,   8,   8,   16,   16,    2,    2,        1},
        // clang-format on
    };

@@ -194,14 +194,14 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
 //         _MBlock_MWaveMPerXdl| ScalarPerVector
 //         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl
 //                             |                
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 16, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 4>,               8},
-  {              S<1, 16, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 16, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 4>,               4},
+  {              S<1, 16, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 32, 1, 8>,               4},
 //  Irregular tile
  {              S<1, 16, 1, 4>,               1},
        // clang-format on
--- a/codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp
+++ b/codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
 #include <iostream>
@@ -55,12 +55,12 @@ std::vector<Operation_Conv_Fwd_Xdl_Cshuffle> Operation_Conv_Fwd_Xdl_Cshuffle::Cr
 //   Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per| Prefetch|
 //       |      |      |      |    |    |     |     | Wave| Wave|    Stage|
 //       |      |      |      |    |    |     |     |     |     |         |
-  {   64,   64,   32,    32,   8,   8,   32,   32,    2,    1,        1},
-  {   256,   128,   256,    32,   8,   8,   32,   32,    4,    2,        1},
-  {   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,        1},
-  {   64,   64,   64,    32,   8,   8,   32,   32,    2,    2,        1},
-  {   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,        1},
-  {   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,        1}
+  {   64,     64,   32,     32,   8,   8,   16,   16,    4,    2,        1},
+  {   256,   128,   256,    32,   8,   8,   16,   16,    8,    4,        1},
+  {   256,   128,   128,    32,   8,   8,   16,   16,    4,    4,        1},
+  {   64,     64,   64,     32,   8,   8,   16,   16,    4,    4,        1},
+  {   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,        1},
+  {   128,   128,   128,    32,   8,   8,   16,   16,    8,    4,        1}
        // clang-format on
    };

@@ -116,11 +116,11 @@ std::vector<Operation_Conv_Fwd_Xdl_Cshuffle> Operation_Conv_Fwd_Xdl_Cshuffle::Cr
 //         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl
 //                             |                
  {              S<1, 16, 1, 4>,               1},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 32, 1, 8>,               8},
+  {              S<1, 16, 1, 16>,              4},
+  {              S<1, 32, 1, 8>,               4},
  {              S<1, 16, 1, 4>,               1},
-  {              S<1, 32, 1, 8>,               8},
-  {              S<1, 16, 1, 8>,               8}
+  {              S<1, 32, 1, 8>,               4},
+  {              S<1, 16, 1, 8>,               4}
        // clang-format on
    };

@@ -223,8 +223,9 @@ extern "C" __global__ void run_${name}(
    constexpr ck::LoopScheduler LoopSched = ck::make_default_loop_scheduler();

    // GridwiseGemm
-    using GridwiseGemm = DeviceConv::GridwiseGemm;
-
+    using GridwiseGemm = ck::conditional_t<ck::get_warp_size() == 64,
+                                           typename DeviceConv::GridwiseGemm64,
+                                           typename DeviceConv::GridwiseGemm32>;
    static constexpr auto I0 = ck::Number<0>{};

    ck::tensor_operation::device::device_grouped_conv_fwd_multiple_abd_xdl_cshuffle<
--- a/codegen/src/utils.cpp
+++ b/codegen/src/utils.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/host/utils.hpp"

@@ -13,7 +13,8 @@ std::size_t integer_divide_ceil(std::size_t x, std::size_t y)

 const std::unordered_set<std::string>& get_xdlop_archs()
 {
-    static std::unordered_set<std::string> supported_archs{"gfx90a", "gfx908", "gfx942"};
+    static std::unordered_set<std::string> supported_archs{
+        "gfx90a", "gfx908", "gfx942", "gfx1100", "gfx1101", "gfx1102", "gfx1200", "gfx1201"};
    return supported_archs;
 }

--- a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp
@@ -160,9 +160,10 @@ struct Epilogue
                                              Epilogue{1.0f, 1.0f});
    out_host.SetZero();
    ref_invoker.Run(ref_argument);**/
-
+    int i = 0;
    for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
    {
+        std::cout << "Testing solution " << std::to_string(++i) << std::endl;
        // substitute instance values into the template
        auto src = ck::host::InterpolateString(
            conv_compile_check,
--- a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
@@ -160,9 +160,10 @@ struct Epilogue
                                              Epilogue{1.0f, 1.0f});
    out_host.SetZero();
    ref_invoker.Run(ref_argument);**/
-
+    int i = 0;
    for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
    {
+        std::cout << "Testing solution " << std::to_string(++i) << std::endl;
        // substitute instance values into the template
        auto src = ck::host::InterpolateString(
            conv_compile_check,
--- a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
@@ -160,9 +160,10 @@ struct Epilogue
                                              Epilogue{1.0f, 1.0f});
    out_host.SetZero();
    ref_invoker.Run(ref_argument);**/
-
+    int i = 0;
    for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
    {
+        std::cout << "Testing solution " << std::to_string(++i) << std::endl;
        // substitute instance values into the template
        auto src = ck::host::InterpolateString(
            conv_compile_check,
--- a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
@@ -160,9 +160,10 @@ struct Epilogue
                                              Epilogue{1.0f, 1.0f});
    out_host.SetZero();
    ref_invoker.Run(ref_argument);**/
-
+    int i = 0;
    for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
    {
+        std::cout << "Testing solution " << std::to_string(++i) << std::endl;
        // substitute instance values into the template
        auto src = ck::host::InterpolateString(
            conv_compile_check,
--- a/docs/Contributors_Guide.rst
+++ b/docs/Contributors_Guide.rst
@@ -5,100 +5,58 @@
 .. _contributing-to:

 ********************************************************************
-Contributor's guide
+Contributing to Composable Kernel
 ********************************************************************

-This chapter explains the rules for contributing to the Composable Kernel project, and how to contribute.
+Review the `Composable Kernel documentation <https://rocm.docs.amd.com/projects/composable_kernel/en/latest/>`_ before contributing to the Composable Kernel project. This documentation provides information about core concepts and configurations, as well as providing :doc:`steps for building Composable Kernel <install/Composable-Kernel-install>`. Some of this information is also available in the `Composable Kernel README <https://github.com/ROCm/composable_kernel/blob/develop/README.md>`_.

-Getting started
-===============
-
-#. **Documentation:** Before contributing to the library, familiarize yourself with the
-   `Composable Kernel User Guide <https://rocm.docs.amd.com/projects/composable_kernel/en/latest/>`_.
-   It provides insight into the core concepts, environment configuration, and steps to obtain or
-   build the library. You can also find some of this information in the
-   `README file <https://github.com/ROCm/composable_kernel/blob/develop/README.md>`_
-   on the project's GitHub page.
-   <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_
-   from the AMD Community portal. It offers a deeper understanding of the library's objectives and showcases its performance capabilities.
-#. **General information:** For broader information about AMD products, consider exploring the
-   `AMD Developer Central portal <https://www.amd.com/en/developer.html>`_.
-
-How to contribute
-===================
-
-You can make an impact by reporting issues or proposing code enhancements through pull requests.
+Consult the `AMD Developer Central portal <https://www.amd.com/en/developer.html>`_ for more information about AMD products.

 Reporting issues
----------------
+=================

-Use `Github issues <https://github.com/ROCm/composable_kernel/issues>`_
-to track public bugs and enhancement requests.
+Use `Github issues <https://github.com/ROCm/composable_kernel/issues>`_ to log and track issues and enhancement requests.

-If you encounter an issue with the library, please check if the problem has already been
-reported by searching existing issues on GitHub. If your issue seems unique, please submit a new
-issue. All reported issues must include:
+If you encounter an issue with the Composable Kernel library, search the existing GitHub issues to determine whether the problem has already been
+reported. If it hasn't, submit a new issue that includes:

-* A comprehensive description of the problem, including:
+* A description of the problem, including what you observed, what you were expecting, and why this was an issue.
+ 
+* Your configuration details, including the GPU, OS, and ROCm version, and any Docker image you used.

-  * What did you observe?
-  * Why do you think it is a bug (if it seems like one)?
-  * What did you expect to happen? What would indicate the resolution of the problem?
-  * Are there any known workarounds?
+* The steps to reproduce the issue, including any CMake command you used to build the library, as well as the frequency of the issue.

-* Your configuration details, including:
+* Any workarounds you've found and what you expect in a resolution. 

-  * Which GPU are you using?
-  * Which OS version are you on?
-  * Which ROCm version are you using?
-  * Are you using a Docker image? If so, which one?

-* Steps to reproduce the issue, including:
+Contributing to the codebase
+=============================

-  * What actions trigger the issue? What are the reproduction steps?
+All external contributors to the Composable Kernel codebase must follow these guidelines:

-    * If you build the library from scratch, what CMake command did you use?
+* Use the correct branch: Use your own branch for your changes. Create your branch from the develop branch. 

-  * How frequently does this issue happen? Does it reproduce every time? Or is it a sporadic issue?
+* Describe your changes: Provide the motivation for the changes and a general description of all code changes.

-Before submitting any issue, ensure you have addressed all relevant questions from the checklist.
+* Add design documents for major changes: Major architectural changes must be accompanied by comprehensive design documents uploaded with your pull request. 

-Creating Pull Requests
----------------------
+* Add inline documentation: Include relevant documentation and inline comments with your code changes.

-You can submit `Pull Requests (PR) on GitHub
-<https://github.com/ROCm/composable_kernel/pulls>`_.
+* Link your pull request to related issues: Add links to any issues resolved by your changes in your pull request description.

-All contributors are required to develop their changes on a separate branch and then create a
-pull request to merge their changes into the `develop` branch, which is the default
-development branch in the Composable Kernel project. All external contributors must use their own
-forks of the project to develop their changes.
+* Verify and test the changes: Run all relevant existing tests and write new tests for any new functionality that isn't covered by existing tests.

-When submitting a Pull Request you should:
+* Provide performance numbers: Include documentation showing before and after performance numbers for any changes that potentially impact build times or run times. 

-* Describe the change providing information about the motivation for the change and a general
-  description of all code modifications.
+* Keep your branch up to date: Regularly rebase or merge the develop branch back into your feature branch. This should be done both prior to creating your pull request and during the review process.

-* Verify and test the change:
+* Ensure a manageable pull request size: Pull requests should be limited to approximately one thousand lines. If your changes significantly exceed one thousand lines, break them into smaller pull requests that can be reviewed independently.

-  * Run any relevant existing tests.
-  * Write new tests if added functionality is not covered by current tests.
+* Use pre-commit hooks to adhere to the coding style: Composable Kernel's coding style is defined in `.clang-format <https://github.com/ROCm/composable_kernel/blob/develop/.clang-format>`_. Use the provided pre-commit hooks to run clang formatting and linting. Instructions on installing pre-commit hooks are available in the `README file <https://github.com/ROCm/composable_kernel/blob/develop/.clang-format>`_. 

-* Ensure your changes align with the coding style defined in the ``.clang-format`` file located in
-  the project's root directory. We leverage `pre-commit` to run `clang-format` automatically. We
-  highly recommend contributors utilize this method to maintain consistent code formatting.
-  Instructions on setting up `pre-commit` can be found in the project's
-  `README file <https://github.com/ROCm/composable_kernel/blob/develop/README.md>`_
+Forks require an approver from AMD to trigger continuous integration (CI) testing. This approval process is necessary for security and resource management.

-* Link your PR to any related issues:
+Depending on the complexity of your changes, an  AMD developer might need to pull your changes and perform additional fixes or modifications before merging. This collaborative approach ensures compatibility with internal systems and standards.

-  * If there is an issue that is resolved by your change, please provide a link to the issue in
-    the description of your pull request.
+You can see a complete list of pull requests on the `Composable Kernel GitHub page <https://github.com/ROCm/composable_kernel/pulls>`_.

-* For larger contributions, structure your change into a sequence of smaller, focused commits, each
-  addressing a particular aspect or fix.
-
-Following the above guidelines ensures a seamless review process and faster assistance from our
-end.
-
-Thank you for your commitment to enhancing the Composable Kernel project! 
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -39,6 +39,7 @@ The Composable Kernel repository is located at `https://github.com/ROCm/composab
    * :doc:`Composable Kernel API reference <./doxygen/html/namespace_c_k>`
    * :doc:`CK Tile API reference <./doxygen/html/namespaceck__tile>`
    * :doc:`Composable Kernel complete API class list <./doxygen/html/annotated>`
+    * :doc:`Composable Kernel glossary <./reference/Composable-Kernel-Glossary>`
    
 To contribute to the documentation refer to `Contributing to ROCm  <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_.

--- a/docs/reference/Composable-Kernel-Glossary.rst
+++ b/docs/reference/Composable-Kernel-Glossary.rst
@@ -0,0 +1,256 @@
+.. meta::
+  :description: Composable Kernel glossary of terms
+  :keywords: composable kernel, glossary
+
+***************************************************
+Composable Kernel glossary
+
+***************************************************
+
+.. glossary::
+    :sorted:
+
+    arithmetic logic unit
+        The arithmetic logic unit (ALU) is the GPU component responsible for arithmetic and logic operations.
+
+    compute unit
+        The compute unit (CU) is the parallel vector processor in an AMD GPU with multiple :term:`ALUs<arithmetic logic unit>`. Each compute unit will run all the :term:`wavefronts<wavefront>` in a :term:`work group>`. A compute unit is equivalent to NVIDIA's streaming   multiprocessor.
+
+    matrix core
+        A matrix core is a specialized GPU unit that accelerate matrix operations for AI and deep learning tasks. A GPU contains multiple matrix cores.
+
+    register
+        Registers are the fastest tier of memory. They're used for storing temporary values during computations and are private to the :term:`work-items<work-item>` that use them.
+
+    VGPR
+        See :term:`vector general purpose register`.
+
+    vector general purpose register 
+        A vector general purpose register (VGPR) is a :term:`register` that stores individual thread data. Each thread in a :term:`wave<wavefront>` has its own set of VGPRs for private variables and calculations. 
+
+    SGPR
+        See :term:`scalar general purpose register`.
+
+    scalar general purpose register
+        A scalar general purpose register (SGPR) is a :term:`register` shared by all the :term:`work items<work item>` in a :term:`wave<wavefront>`. SGPRs are used for constants, addresses, and control flow common across the entire wave.
+
+    LDS
+        See :term:`local data share`.
+
+    local data share
+        Local data share (LDS) is high-bandwidth, low-latency on-chip memory accessible to all the :term:`work-items<work-item>` in a :term:`work group`. LDS is equivalent to NVIDIA's shared memory. 
+
+    LDS banks
+        LDS banks are a type of memory organization where consecutive addresses are distributed across multiple memory banks for parallel access. LDS banks are used to prevent memory access conflicts and improve bandwidth when LDS is used.
+
+    global memory
+        The main device memory accessible by all threads, offering high capacity but higher latency than shared memory.
+
+    pinned memory
+        Pinned memory is :term:`host` memory that is page-locked to accelerate transfers between the CPU and GPU.
+
+    dense tensor
+        A dense tensor is a tensor where most of its elements are non-zero. Dense tensors are typically stored in a contiguous block of memory.
+
+    sparse tensor
+        A sparse tensor is a tensor where most of its elements are zero. Typically only the non-zero elements of a sparse tensor and their indices are stored.
+
+    host
+        Host refers to the CPU and the main memory system that manages GPU execution. The host is responsible for launching kernels, transferring data, and coordinating overall computation.
+
+    device
+        Device refers to the GPU hardware that runs parallel kernels. The device contains the :term:`compute units<compute unit>`, memory hierarchy, and specialized accelerators.
+
+    work-item
+        A work-item is the smallest unit of parallel execution. A work-item runs a single independent instruction stream on a single data element. A work-item is equivalent to an NVIDIA thread.
+
+    wavefront
+        Also referred to as a wave, a wavefront is a group of :term:`work-items<work-item>` that run the same instruction. A wavefront is equivalent to an NVIDIA warp.
+
+    work group
+        A work group is a collection of :term:`work-items<work-item>` that can synchronize and share memory. A work group is equivalent to NVIDIA's thread block. 
+
+    grid
+        A grid is a collection of :term:`work groups<work group>` that run a kernel. Each work group within the grid operates independently and can be scheduled on a different :term:`compute unit`. A grid can be organized into one, two, or three dimensions. A grid is equivalent to an NVIDIA thread block.
+
+    block Size
+        The block size is the number of :term:`work-items<work-item>` in a :term:`compute unit`.
+
+    SIMT
+        See :term:`single-instruction, multi-thread`
+
+    single-instruction, multi-thread 
+        Single-instruction, multi-thread (SIMT) is a parallel computing model where all the :term:`work-items<work-item>` within a :term:`wavefront` run the same instruction on different data. 
+
+    SIMD
+        See :term:`single-instruction, multi-data`
+
+    single-instruction, multi-data
+        Single-instruction, multi-data (SIMD) is a parallel computing model where the same instruction is run with different data simultaneously. 
+
+    occupancy
+        The ratio of active :term:`wavefronts<wavefront>` to the maximum possible number of wavefronts.
+
+    kernel
+        A kernel is a function that runs an :term:`operation` or a collection of operations. A kernel will run in parallel on several :term:`work-items<work-item>` across the GPU. In Composable Kernel, kernels require :term:`pipelines<pipeline>`.
+
+    operation
+        An operation is a computation on input data. 
+        
+    pipeline
+        A Composable Kernel pipeline schedules the sequence of operations for a :term:`kernel`, such as the data loading, computation, and storage phases. A pipeline consists of a :term:`problem` and a :term:`policy`. 
+
+    tile partitioner
+        The tile partitioner defines the mapping between the :term:`problem` dimensions and GPU hierarchy. It specifies :term:`workgroup`-level :term:`tile` sizes and determines :term:`grid` dimensions by dividing the problem size by the tile sizes.
+
+    problem
+        The problem is the part of the :term:`pipeline` that defines input and output shapes, data types, and mathematical :term:`operations<operation>`.
+
+    policy
+        The policy is the part of the :term:`pipeline` that defines memory access patterns and hardware-specific optimizations.
+
+    user customized tile pipeline
+        A customized :term:`tile` :term:`pipeline` that combines custom :term:`problem` and :term:`policy` components for specialized computations. 
+
+    user customized tile pipeline optimization
+        The process of tuning the :term:`tile` size, memory access pattern, and hardware utilization for specific workloads.
+
+    tile programming API
+        The :term:`tile` programming API is Composable Kernel's high-level interface for defining tile-based computations with predefined hardware mappings for data loading and storing.
+
+    coordinate transformation primitives
+        Coordinate transformation primitives are Composable Kernel utilities for converting between different coordinate systems.
+
+    reference kernel
+        A reference :term:`kernel` is a baseline kernel implementation used to verify correctness and performance. Composable Kernel makes two reference kernels, one for CPU and one for GPU, available.
+
+    launch parameters
+        Launch parameters are the configuration values, such as :term:`grid` and :term:`block size`, that determine how a :term:`kernel` is mapped to hardware resources.
+
+    memory coalescing
+        Memory coalescing is an optimization strategy where consecutive :term:`work-items<work-item>` access consecutive memory addresses in such a way that a single memory transaction serves multiple work-items.
+
+    alignment
+        Alignment is a memory management strategy where data structures are stored at addresses that are multiples of a specific value.
+
+
+    bank conflict
+        A bank conflict occurs when multiple :term:`work-items<work-item>` in a :term:`wavefront` access different addresses that map to the same shared memory bank.
+
+    padding
+        Padding is the addition of extra elements, often zeros, to tensor edges in order to control output size in convolution and pooling, or to align data for memory access.
+
+    transpose
+        Transpose is an :term:`operation` that rearranges the order of tensor axes, often for the purposes of matching :term:`kernel` input formats or optimize memory access patterns.
+
+    permute
+        Permute is an :term:`operation` that rearranges the order of tensor axes, often for the purposes of matching :term:`kernel` input formats or optimize memory access patterns.
+
+    host-device transfer
+        A host-device transfer is the process of moving data between :term:`host` and :term:`device` memory. 
+
+    stride
+        A stride is the step size to move from one element to the next in a specific dimension of a tensor or matrix. In convolution and pooling, the stride determines how far the :term:`kernel` moves at each step.
+
+    dilation
+        Dilation is the spacing between :term:`kernel` elements in convolution :term:`operations<operation>`, allowing the receptive field to grow without increasing kernel size.
+
+    Im2Col
+        Im2Col is a data transformation technique that converts image data to column format.
+
+    Col2Im
+        Col2Im is a data transformation technique that converts column data to image format.
+
+    fast changing dimension
+        The fast changing dimension is the innermost dimension in memory layout.
+
+    outer dimension
+        The outer dimension is the slower-changing dimension in memory layout.
+
+    inner dimension
+        The inner dimension is the faster-changing dimension in memory layout.
+
+    tile
+        A tile is a sub-region of a tensor or matrix that is processed by a :term:`work group` or :term:`work-item`. Rectangular data blocks are the unit of computation and memory transfer in Composable Kernel, and are the basis for tiled algorithms.
+
+    block tile
+        A block tile is a memory :term:`tile` processed by a :term:`work group`.
+
+    wave tile
+        A wave :term:`tile` is a sub-tile processed by a single :term:`wavefront` within a :term:`work group`. The wave tile is the base level granularity of a :term:`single-instruction, multi-thread (SIMD)<single-instruction, multi-thread>` model.
+
+    tile distribution
+        The tile distribution is the hierarchical data mapping from :term:`work-items<work-item>` to data in memory.
+
+    tile window
+        Viewport into a larger tensor that defines the current tile's position and boundaries for computation.
+
+    load tile
+        Load tile is an operation that transfers data from :term:`global memory` or the :term:`load data share` to :term:`vector general purpose registers<vector general purpose register>`.
+
+    store tile
+        Store tile is an operation that transfers data from  :term:`vector general purpose registers<vector general purpose register>` to :term:`global memory` or the :term:`load data share`.
+
+    descriptor
+        Metadata structure that defines :term:`tile` properties, memory layouts, and coordinate transformations for Composable Kernel :term:`operations<operation>`.
+
+    input
+        See :term:`problem shape`.
+
+    problem shape
+        The problem shape defines the dimensions and data types of input tensors that define the :term:`problem`.
+
+    vector
+        The vector is the smallest data unit processed by an individual :term:`work-item`. A vectors is typically four to sixteen elements, depending on data type and hardware.
+
+    elementwise
+        An elementwise :term:`operation` is an operation applied to each tensor element independently. 
+
+    epilogue
+        The epilogue is the final stage of a kernel. Activation functions, bias, and other post-processing steps are applied in the epilogue. 
+
+    Add+Multiply
+        See :term:`fused add multiply`.
+
+    fused add multiply
+        A common fused :term:`operation` in machine language and linear algebra, where an :term:`elementwise` addition is immediately followed by a multiplication. Fused add multiply is often used for bias and scaling in neural network layers.
+
+    MFMA
+        See :term:`matrix fused multiply-add`.
+
+    matrix fused multiply-add
+        Matrix fused multiply-add (MFMA) is a :term:`matrix core` instruction for GEMM :term:`operations<operation>`. 
+
+    GEMM
+        See :term:`general matrix multiply`.
+
+    general matrix multiply 
+        A general matrix multiply (GEMM) is a Core matrix :term:`operation` in linear algebra and deep learning. A GEMM is defined as :math:`C = {\alpha}AB + {\beta}C`, where :math:`A`, :math:`B`, and :math:`C` are matrices, and :math:`\alpha` and :math:`\beta` are scalars. 
+
+    VGEMM
+        See :term:`naive GEMM`.
+
+    vanilla GEMM
+        See :term:`naive GEMM`.
+
+    naive GEMM 
+        The naive GEMM, sometimes referred to as a vanilla GEMM or VGEMM, is the simplest form of :term:`GEMM` in Composable Kernel. The naive GEMM is defined as :math:`C = AB`, where :math:`A`, :math:`B`, and :math:`C` are matrices. The naive GEMM is the baseline GEMM that all other GEMM :term:`operations<operation>` build on.
+
+    GGEMM
+        See :term:`grouped GEMM`.
+
+    grouped GEMM
+        A :term:`kernel` that calls multiple :term:`VGEMMs<naive GEMM>`. Each call can have a different :term:`problem shape`. 
+
+    batched GEMM
+        A :term:`kernel` that calls :term:`VGEMMs<naive GEMM>` with different batches of data. All the data batches have the same :term:`problem shape`. 
+
+    Split-K GEMM
+        Split-K GEMM is a parallelization strategy that partitions the reduction dimension (K) of a :term:`GEMM` across multiple :term:`compute units<compute unit>`, increasing parallelism for large matrix multiplications.
+
+    GEMV
+        See :term:`general matrix vector multiplication`
+
+    general matrix vector multiplication
+        General matrix vector multiplication (GEMV) is an :term:`operation` where a matrix is multiplied by a vector, producing another vector. 
+
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -34,8 +34,14 @@ subtrees:
    title: Composable Kernel vector utilities
  - file: reference/Composable-Kernel-wrapper.rst
    title: Composable Kernel wrapper
+  - file: doxygen/html/namespace_c_k.rst
+    title: CK API reference 
+  - file: doxygen/html/namespaceck__tile.rst
+    title: CK Tile API reference
  - file: doxygen/html/annotated.rst
-    title: Composable Kernel class list
+    title: Full API class list
+  - file: reference/Composable-Kernel-Glossary.rst
+    title: Glossary

 - caption: About
  entries:
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -44,8 +44,7 @@ list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllv
 example_compile_options(example_gemm_xdl_fp8_v3 PRIVATE ${GEMM_OPTIONS})
 example_compile_options(example_gemm_xdl_bf16_v3 PRIVATE ${GEMM_OPTIONS})

-
-list(APPEND gpu_list gfx942 gfx950)
+list(APPEND gpu_list gfx942 gfx950 gfx1200 gfx1201 gfx12-generic)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list AND target EQUAL 0)
@@ -89,7 +88,14 @@ foreach(gpu IN LISTS GPU_TARGETS)

        add_example_executable(example_gemm_xdl_lds_direct_load_fp16 gemm_xdl_lds_direct_load_fp16.cpp)
        add_example_dependencies(example_gemm_xdl example_gemm_xdl_lds_direct_load_fp16)
+        set(target 1)
+    endif()
+endforeach()

+list(APPEND gpu_list gfx90a gfx942 gfx950 gfx1200 gfx1201 gfx12-generic)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list AND target EQUAL 0)
        add_example_executable(example_gemm_xdl_bf16_streamk_v3 gemm_xdl_bf16_streamk_v3.cpp)
        add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_streamk_v3)

@@ -99,6 +105,16 @@ foreach(gpu IN LISTS GPU_TARGETS)
    endif()
 endforeach()

+list(APPEND gpu_list_tf32 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0)
+        add_example_executable(example_gemm_xdl_lds_direct_load_fp32_tf32 gemm_xdl_lds_direct_load_fp32_tf32.cpp)
+        add_example_dependencies(example_gemm_xdl example_gemm_xdl_lds_direct_load_fp32_tf32)
+        set(target 1)
+    endif()
+endforeach()
+
 add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)

--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -310,10 +310,14 @@ bool parse_cmd_args<ProblemSizeSplitK>(int argc,
    return true;
 }

-template <typename DataType>
+template <typename DataType, typename ComputeDataType = DataType>
 inline __host__ __device__ constexpr double get_rtol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<ComputeDataType, ck::tf32_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
    {
        return 1e-3;
    }
@@ -351,10 +355,14 @@ inline __host__ __device__ constexpr double get_rtol()
    }
 }

-template <typename DataType>
+template <typename DataType, typename ComputeDataType = DataType>
 inline __host__ __device__ constexpr double get_atol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<ComputeDataType, ck::tf32_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
    {
        return 1e-3;
    }
--- a/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -199,9 +199,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        return true;
    }

-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx11_supported() || ck::is_gfx12_supported()))
    {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;

        return true;
    }
--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -37,7 +37,7 @@ using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffl
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           2,              S<1, 16, 1, 16>,               8, ck::LoopScheduler::Interwave, ck::PipelineVersion::v1>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           2,              S<1, 16, 1, 16>,               4, ck::LoopScheduler::Interwave, ck::PipelineVersion::v1>;
 // clang-format on

 using DeviceGemmInstance = DeviceGemmInstance1;
--- a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -249,9 +249,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        return true;
    }

-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx11_supported() || ck::is_gfx12_supported()))
    {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;

        return true;
    }
--- a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -38,14 +38,14 @@ using DeviceGemmV2Instance =
        AElementOp, BElementOp, CElementOp, GemmDefault, 
        256, Scale_Block_N, Scale_Block_K,
        128, 128,
-        KPerBlock, 8, 32,
-        32,   32,
-        4,    1,
+        KPerBlock, 8, 16,
+        16,   16,
+        8,    2,
        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
        2, 8, 8, 0,
        S<2, 128, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
-        2, 32, 32, 0,
-        1, 1, S<1, 32, 1, 8>, 8,
+        2, 16, 16, 0,
+        1, 1, S<1, 16, 1, 16>, 4,
        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, CDataType, CDataType, PermuteA, PermuteB>;

 // clang-format on
@@ -281,9 +281,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        return true;
    }

-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx11_supported() || ck::is_gfx12_supported()))
    {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;

        return true;
    }
--- a/example/01_gemm/gemm_xdl_fp16_v2.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_v2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -33,13 +33,13 @@ using DeviceGemmInstance =
        2,   256,
        256, 256, 
        32, 8, 4,
-        32,   32,
-        4,    4, 
+        16,   16,
+        8,    8, 
        S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
        2, 8, 8, 0,
        S<8, 32, 1>,  S<0, 2, 1>,  S<0, 2, 1>,
        1, 8, 4, 0,
-        1, 1, S<1, 32, 1, 8>, 8,
+        1, 1, S<1, 32, 1, 8>, 4,
        ck::LoopScheduler::Default, ck::PipelineVersion::v1>;
 // clang-format on

--- a/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp
@@ -28,7 +28,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa

 static constexpr bool PermuteA = false;
 static constexpr bool PermuteB = false;
-
+static constexpr int KPack     = 32; // int4 -> 32, fp8 -> 16, fp16 -> 8
 // clang-format off
 #if 0
 using DeviceGemmV2Instance = 
@@ -56,14 +56,14 @@ using DeviceGemmV2Instance =
        AElementOp, BElementOp, CElementOp, GemmDefault, 
        256,
        256, 256,
-        128, 16, 32,
-        32,   32,
-        4,    4,
+        128, 16, KPack,
+        16,   16,
+        8,    8,
        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
        2, 16, 16, 0,
        S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
        2, 32, 32, 0,
-        1, 1, S<1, 32, 1, 8>, 8,
+        1, 1, S<1, 32, 1, 8>, 4,
        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, F8, F8, PermuteA, PermuteB>;

 #endif
@@ -160,7 +160,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    auto gemm = DeviceGemmV2Instance{};

    // weight pre-shuffle
-    int KPack = 32; // int4 -> 32, fp8 -> 16, fp16 -> 8
    int NLane = gemm.GetPreShuffleParameters();
    int KLane = 64 / NLane;

@@ -269,9 +268,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        return true;
    }

-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx12_supported()))
    {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950 and gfx12 only" << std::endl;

        return true;
    }
--- a/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -38,14 +38,14 @@ using DeviceGemmV2Instance =
        AElementOp, BElementOp, CElementOp, GemmDefault, 
        256,
        128, 128,
-        KPerBlock, 16, 32,
-        32,   32,
-        2,    2,
+        KPerBlock, 16, 16,
+        16,   16,
+        4,    4,
        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
        2, 16, 16, 0,
        S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
-        2, 32, 32, 0,
-        1, 1, S<1, 32, 1, 8>, 8,
+        2, 16, 16, 0,
+        1, 1, S<1, 32, 1, 8>, 4,
        ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v2, ADataType, ADataType, PermuteA, PermuteB>;

 // clang-format on
@@ -247,9 +247,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        return true;
    }

-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx12_supported()))
    {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950 and gfx12 only" << std::endl;

        return true;
    }
--- a/example/01_gemm/gemm_xdl_fp8_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_v3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -36,7 +36,7 @@ using DeviceGemmV2Instance =
        2, 16, 16, 0,
        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
        2, 16, 16, 0,
-        1, 2, S<1, 32, 1, 8>, 8,
+        1, 2, S<1, 32, 1, 8>, 4,
        ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3, ck::f8_t>;
 // clang-format on

--- a/example/01_gemm/gemm_xdl_lds_direct_load_fp32_tf32.cpp
+++ b/example/01_gemm/gemm_xdl_lds_direct_load_fp32_tf32.cpp
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "common.hpp"
+
+#define USING_DIRECT_LOADS 1
+#if USING_DIRECT_LOADS
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_lds_direct_load.hpp"
+#else
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
+#endif
+
+#define EXAMPLE_WITH_COMPUTE_DATATYPE
+
+using F32 = float;
+
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = F32;
+using ComputeDataType  = ck::tf32_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+#if USING_DIRECT_LOADS
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_LdsDirectLoad
+// ######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     AccData|         CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer|
+// ######| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockLds|
+// ######| CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| LoopScheduler      |  pipeline ver           | gemm type  |
+// ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|
+// ######| XDL|  XDL|  Per|  Per|   ThreadCluster| SrcAccessOrder|   SrcVectorDim|         Scalar| AddExtraM|   ThreadCluster| SrcAccessOrder|  SrcVectorDim|         Scalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+// ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|               |               |      PerVector|          | Lengths_K0_N_K1|               |              |      PerVector|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+// ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |          |                |               |              |               |          |            |            |                             |                |
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    32,
+           8,   8,   32,   32,    2,    2,      S<4, 8, 8>,     S<1, 0, 2>,              2,              1,         1,      S<4, 8, 8>,     S<1, 0, 2>,             2,              1,         1,
+           1,           1,               S<1, 8, 1, 8>,               4,   ck::LoopScheduler::Default, ck::PipelineVersion::v4, ComputeDataType>;
+// clang-format on
+#else
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
+// ######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     AccData|         CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+// ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+// ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+// ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 8, 1, 8>,               4>;
+// clang-format on
+#endif
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        AccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        CElementOp,
+                                                                        ComputeDataType,
+                                                                        ComputeDataType>;
+
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
+#include "run_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
+
+#undef EXAMPLE_WITH_COMPUTE_DATATYPE
--- a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
@@ -36,7 +36,7 @@ using BDataType   = ck::half_t;
 using CDataType   = ck::half_t;
 using AccDataType = float;
 #else  
-                    <   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    16,   64,     4,  4,   16,   16,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,             4,      4,        7,               1>;
+                    <   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    16,   128,     4,  4,   16,   16,    1,    2,     S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,             4,      4,        7,               1>;
 using ADataType   = float;
 using BDataType   = float;
 using CDataType   = float;
@@ -185,7 +185,6 @@ int main(int argc, char* argv[])
    auto a_element_op = AElementOp{};
    auto b_element_op = BElementOp{};
    auto c_element_op = CElementOp{};
-
    // do GEMM
    auto gemm     = DeviceGemmInstance{};
    auto invoker  = gemm.MakeInvoker();
@@ -209,8 +208,7 @@ int main(int argc, char* argv[])
        return 0;
    }

-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-
+    float ave_time   = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =
        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
--- a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -29,7 +29,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_WaveletM
 // ######|        |        |        |      Type|      Type|        Type|         DataType|      Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| ThreadGroupSize| ThreadGroupSize| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|        |        |        |          |          |            |                 |          |   Operation|   Operation|   Operation|               |    Stage|                |                |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|        |        |        |          |          |            |                 |          |            |            |            |               |         |                |                |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType,              F16, CDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,             256,             256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,                S<1, 32, 1,8>,               8>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType,              F16, CDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,             256,             256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,                S<1, 32, 1,8>,               4>;
 // clang-format on

 using DeviceGemmInstance = DeviceGemmInstance;
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -2,7 +2,11 @@
 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
-#include "ck/library/utility/validation_common.hpp"
+
+// use macro to minimize code change
+#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
+using ComputeDataType = AccDataType;
+#endif

 template <typename ProblemType>
 bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
@@ -24,11 +28,11 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
            }
            else
            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
            }
        };

@@ -54,17 +58,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
    StrideC = f_get_default_stride(M, N, StrideC, CLayout{});

-    try
-    {
-        ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
-            M, N, K, StrideA, StrideB, StrideC);
-    }
-    catch(const std::runtime_error& e)
-    {
-        std::cerr << "Error: " << e.what() << std::endl;
-        return false;
-    }
-
    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));

@@ -218,8 +211,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        pass &= ck::utils::check_err(c_m_n_device_result,
                                     c_m_n_host_result,
                                     "Error: Incorrect results!",
-                                     get_rtol<CDataType>(),
-                                     get_atol<CDataType>());
+                                     get_rtol<CDataType, ComputeDataType>(),
+                                     get_atol<CDataType, ComputeDataType>());
 #endif
    }

@@ -249,8 +242,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        pass &= ck::utils::check_err(c_m_n_device_result,
                                     c_m_n_device_ref_result,
                                     "Error: Incorrect results!",
-                                     get_rtol<CDataType>(),
-                                     get_atol<CDataType>());
+                                     get_rtol<CDataType, ComputeDataType>(),
+                                     get_atol<CDataType, ComputeDataType>());
    }

    return pass == true;
--- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
@@ -87,10 +87,10 @@ using DeviceOpInstance =
                                                                   32,
                                                                   8,
                                                                   8,
-                                                                   32,
-                                                                   32,
+                                                                   16,
+                                                                   16,
+                                                                   8,
                                                                   4,
-                                                                   2,
                                                                   S<4, 64, 1>,
                                                                   S<1, 0, 2>,
                                                                   S<1, 0, 2>,
@@ -108,7 +108,7 @@ using DeviceOpInstance =
                                                                   1,
                                                                   1,
                                                                   S<1, 32, 1, 8>,
-                                                                   8>;
+                                                                   4>;

 int main(int argc, char* argv[])
 {
--- a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
+++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -83,10 +83,10 @@ using DeviceOpInstance =
                                                                   32,
                                                                   8,
                                                                   8,
-                                                                   32,
-                                                                   32,
+                                                                   16,
+                                                                   16,
+                                                                   8,
                                                                   4,
-                                                                   2,
                                                                   S<4, 64, 1>,
                                                                   S<1, 0, 2>,
                                                                   S<1, 0, 2>,
@@ -104,7 +104,7 @@ using DeviceOpInstance =
                                                                   1,
                                                                   1,
                                                                   S<1, 32, 1, 8>,
-                                                                   8>;
+                                                                   4>;

 int main(int argc, char* argv[])
 {
@@ -113,13 +113,13 @@ int main(int argc, char* argv[])
    bool time_kernel     = false;

    // GEMM shape
-    ck::index_t M = 3840;
-    ck::index_t N = 4096;
-    ck::index_t K = 4096;
+    ck::index_t M = 1920;
+    ck::index_t N = 2048;
+    ck::index_t K = 2048;

-    ck::index_t StrideA = 4096;
-    ck::index_t StrideB = 4096;
-    ck::index_t StrideE = 4096;
+    ck::index_t StrideA = 2048;
+    ck::index_t StrideB = 2048;
+    ck::index_t StrideE = 2048;

    if(argc == 1)
    {
@@ -174,6 +174,9 @@ int main(int argc, char* argv[])
    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));

+    const auto StrideD = std::is_same<decltype(ELayout{}), ck::tensor_layout::gemm::RowMajor>::value
+                             ? d_m_n.mDesc.GetStrides()[0]
+                             : d_m_n.mDesc.GetStrides()[1];
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
@@ -221,7 +224,7 @@ int main(int argc, char* argv[])
                               K,
                               StrideA,
                               StrideB,
-                               std::array<ck::index_t, 1>{0},
+                               std::array<ck::index_t, 1>{static_cast<int>(StrideD)},
                               StrideE,
                               a_element_op,
                               b_element_op,
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -32,7 +32,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -32,7 +32,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -31,7 +31,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               2>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
--- a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+++ b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
@@ -7,7 +7,9 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
 #endif
    using namespace ck::literals;

-    auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
+    ProblemSize ps =
+        problem_size; // make mutable copy because default stride values of 0 need to be updated
+    auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = ps;

    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
@@ -41,6 +43,30 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;

+    // If any user-provided leading stride <= 0, replace it with the one determined by the
+    // created tensor descriptor. For RowMajor the leading stride is index 0, for ColMajor index 1.
+    auto fetch_leading_stride = [](const auto& tensor, auto layout_tag) -> int {
+        if constexpr(std::is_same_v<decltype(layout_tag), ck::tensor_layout::gemm::RowMajor>)
+        {
+            return static_cast<int>(tensor.GetStrides()[0]);
+        }
+        else
+        {
+            return static_cast<int>(tensor.GetStrides()[1]);
+        }
+    };
+
+    if(StrideA <= 0)
+        StrideA = fetch_leading_stride(a_m_k, ALayout{});
+    if(StrideB <= 0)
+        StrideB = fetch_leading_stride(b_k_n, BLayout{});
+    if(StrideD0 <= 0)
+        StrideD0 = fetch_leading_stride(d0_m_n, D0Layout{});
+    if(StrideD1 <= 0)
+        StrideD1 = fetch_leading_stride(d1_m_n, D1Layout{});
+    if(StrideE <= 0)
+        StrideE = fetch_leading_stride(e_m_n_host_result, ELayout{});
+
    switch(config.init_method)
    {
    case 0: break;
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
@@ -19,4 +19,13 @@ foreach(gpu IN LISTS GPU_TARGETS)
        add_example_executable(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
        set(target 1)
    endif()
-endforeach()
+endforeach()
+
+list(APPEND gpu_list_tf32 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0)
+        add_example_executable(example_convnd_fwd_xdl_fp32_tf32 convnd_fwd_xdl_fp32_tf32.cpp)
+        set(target 1)
+    endif()
+endforeach()
--- a/example/09_convnd_fwd/convnd_fwd_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_common.hpp
@@ -27,10 +27,14 @@ void print_helper_msg()
              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
 }

-template <typename DataType>
+template <typename DataType, typename GemmType = DataType>
 inline __host__ __device__ constexpr double get_rtol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<GemmType, ck::tf32_t>)
+    {
+        return 5e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
    {
        return 1e-3;
    }
@@ -68,10 +72,14 @@ inline __host__ __device__ constexpr double get_rtol()
    }
 }

-template <typename DataType>
+template <typename DataType, typename GemmType = DataType>
 inline __host__ __device__ constexpr double get_atol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<GemmType, ck::tf32_t>)
+    {
+        return 1e-2;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
    {
        return 1e-3;
    }
@@ -116,7 +124,8 @@ template <ck::index_t NDimSpatial,
          typename InElementOp,
          typename WeiElementOp,
          typename OutElementOp,
-          typename DeviceConvNDFwdInstance>
+          typename DeviceConvNDFwdInstance,
+          typename ComputeDataType = OutDataType>
 bool run_grouped_conv_fwd(bool do_verification,
                          int init_method,
                          bool time_kernel,
@@ -228,7 +237,11 @@ bool run_grouped_conv_fwd(bool do_verification,
                                                                     OutDataType,
                                                                     InElementOp,
                                                                     WeiElementOp,
-                                                                     OutElementOp>();
+                                                                     OutElementOp,
+                                                                     0,
+                                                                     0,
+                                                                     0,
+                                                                     ComputeDataType>();

        auto ref_invoker  = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(in,
@@ -249,8 +262,8 @@ bool run_grouped_conv_fwd(bool do_verification,
        return ck::utils::check_err(out_device,
                                    out_host,
                                    "Error: incorrect results!",
-                                    get_rtol<OutDataType>(),
-                                    get_atol<OutDataType>());
+                                    get_rtol<OutDataType, ComputeDataType>(),
+                                    get_atol<OutDataType, ComputeDataType>());
    }

    return true;
--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -51,10 +51,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -72,7 +72,7 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8>;
+        4>;

 #include "run_convnd_fwd_example.inc"

--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -52,10 +52,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -73,9 +73,17 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8,
+        4,
        ComputeType>;

 #include "run_convnd_fwd_example.inc"

-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf8_fp8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf8_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -53,10 +53,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -74,10 +74,18 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8,
+        4,
        AComputeType,
        BComputeType>;

 #include "run_convnd_fwd_example.inc"

-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -51,10 +51,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -72,7 +72,7 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8>;
+        4>;

 #include "run_convnd_fwd_example.inc"

--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16_comp_fp8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16_comp_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -52,10 +52,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -73,9 +73,17 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8,
+        4,
        ComputeType>;

 #include "run_convnd_fwd_example.inc"

-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // fp8 are not supported on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -76,4 +76,11 @@ using DeviceGroupedConvNDFwdInstance =

 #include "run_convnd_fwd_example.inc"

-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp32_tf32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32_tf32.cpp
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+#define EXAMPLE_WITH_COMPUTE_DATATYPE
+
+using InDataType       = float;
+using WeiDataType      = float;
+using AccDataType      = float;
+using CShuffleDataType = float;
+using OutDataType      = float;
+using ComputeDataType  = ck::tf32_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,         // ALayout
+        WeiLayout,        // BLayout
+        ck::Tuple<>,      // DsLayout
+        OutLayout,        // ELayout
+        InDataType,       // ADataType
+        WeiDataType,      // BDataType
+        AccDataType,      // AccDataType
+        CShuffleDataType, // CShuffleDataType
+        ck::Tuple<>,      // DsDataType
+        OutDataType,      // EDataType
+        InElementOp,      // AElementwiseOperation
+        WeiElementOp,     // BElementwiseOperation
+        OutElementOp,     // CDEElementwiseOperation
+        ConvSpec,         // ConvForwardSpecialization
+        GemmSpec,         // GemmSpecialization
+        1,                // NumGemmKPrefetchStage
+        256,              // BlockSize
+        128,              // MPerBlock
+        192,              // NPerBlock
+        16,               // KPerBlock
+        4,                // AK1
+        4,                // BK1
+        32,               // MPerXdl
+        32,               // NPerXdl
+        2,                // MXdlPerWave
+        3,                // NXdlPerWave
+        S<4, 64, 1>,      // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,       // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,       // ABlockTransferSrcAccessOrder
+        2,                // ABlockTransferSrcVectorDim
+        4,                // ABlockTransferSrcScalarPerVector
+        4,                // ABlockTransferDstScalarPerVector_AK1
+        1,                // ABlockLdsExtraM
+        S<4, 64, 1>,      // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,       // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,       // BBlockTransferSrcAccessOrder
+        2,                // BBlockTransferSrcVectorDim
+        4,                // BBlockTransferSrcScalarPerVector
+        4,                // BBlockTransferDstScalarPerVector_BK1
+        1,                // BBlockLdsExtraN
+        1,                // CShuffleMXdlPerWavePerShuffle
+        1,                // CShuffleNXdlPerWavePerShuffle
+        S<1, 16, 1, 16>,  // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        4,                // CDEBlockTransferScalarPerVector_NPerBlock
+        ComputeDataType,  // AComputeDataType
+        ComputeDataType,  // BComputeDataType
+        ck::LoopScheduler::Default, // LoopScheduler
+        1                           // NumGroupsToMerge
+        >;
+
+#include "run_convnd_fwd_example.inc"
+
+int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+
+#undef EXAMPLE_WITH_COMPUTE_DATATYPE
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -7,6 +7,8 @@

 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"

+#define EXAMPLE_WITH_COMPUTE_DATATYPE
+
 using InDataType       = ck::f8_t;
 using WeiDataType      = ck::f8_t;
 using AccDataType      = float;
@@ -52,10 +54,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -73,9 +75,19 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8,
+        4,
        ComputeDataType>;

 #include "run_convnd_fwd_example.inc"

-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
+
+#undef EXAMPLE_WITH_COMPUTE_DATATYPE
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp8_bf8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp8_bf8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -53,10 +53,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -74,10 +74,18 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8,
+        4,
        AComputeType,
        BComputeType>;

 #include "run_convnd_fwd_example.inc"

-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
--- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -51,10 +51,10 @@ using DeviceGroupedConvNDFwdInstance =
        64,          // KPerBlock
        16,          // AK1
        16,          // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -71,8 +71,8 @@ using DeviceGroupedConvNDFwdInstance =
        1,           // BBlockLdsExtraN
        1,
        1,
-        S<1, 64, 1, 4>,
-        16>;
+        S<1, 32, 1, 8>,
+        4>;

 #include "run_convnd_fwd_example.inc"

--- a/example/09_convnd_fwd/run_convnd_fwd_example.inc
+++ b/example/09_convnd_fwd/run_convnd_fwd_example.inc
@@ -3,6 +3,11 @@

 #pragma once

+// use macro to minimize code change
+#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
+using ComputeDataType = AccDataType;
+#endif
+
 bool run_convnd_fwd_example(int argc, char* argv[])
 {
    print_helper_msg();
@@ -65,17 +70,17 @@ bool run_convnd_fwd_example(int argc, char* argv[])
            InElementOp,
            WeiElementOp,
            OutElementOp,
-            DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>>(
-            do_verification,
-            init_method,
-            time_kernel,
-            conv_param,
-            in_g_n_c_wis_desc,
-            wei_g_k_c_xs_desc,
-            out_g_n_k_wos_desc,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
+            DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>,
+            ComputeDataType>(do_verification,
+                             init_method,
+                             time_kernel,
+                             conv_param,
+                             in_g_n_c_wis_desc,
+                             wei_g_k_c_xs_desc,
+                             out_g_n_k_wos_desc,
+                             in_element_op,
+                             wei_element_op,
+                             out_element_op);
    };

    namespace ctc = ck::tensor_layout::convolution;
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
@@ -125,7 +125,7 @@ inline bool parse_cmd_args(int argc,

        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
        problem_size                      = ck::utils::conv::parse_conv_param(
-            num_dim_spatial, threshold_to_catch_partial_args, argv);
+            num_dim_spatial, threshold_to_catch_partial_args + 1, argv);
    }
    else
    {
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -15,4 +15,11 @@ using RsDataType        = ck::Tuple<R0DataType>;

 #include "run_convnd_fwd_max_example.inc"

-int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+    return !run_convnd_fwd_max_example(argc, argv);
+}
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
@@ -23,7 +23,7 @@ using RsGlobalReduceOp =
 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

 // clang-format off
 template <ck::index_t NDimSpatial>
@@ -36,7 +36,7 @@ using DeviceInstance =
 #ifdef BUILD_INT4_EXAMPLE
        < NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>, KernelADataType, KernelBDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,       ConvSpec,   GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
 #else
-        < NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>,       ADataType,       BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,       ConvSpec,   GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
+        < NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>,       ADataType,       BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,       ConvSpec,   GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,                    4,                  1>;
 #endif

 template <ck::index_t NDimSpatial>
--- a/example/12_reduce/reduce_blockwise_two_call.cpp
+++ b/example/12_reduce/reduce_blockwise_two_call.cpp
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
    const std::array<int, 2> reduceDims = {3, 4};
    // const std::array<int, 3> invariantDims = {0, 1, 2};

-    const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
+    std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};

    // input lengths of the second reduction, which is also the output lengths of the first
    // reduction
-    const std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
+    std::vector<size_t> inLengths_2 = {64, 320, 80, 4};

-    const std::vector<size_t> outLengths = {64, 320, 80};
+    std::vector<size_t> outLengths = {64, 320, 80};

    if(argc == 1)
    {
@@ -114,11 +114,26 @@ int main(int argc, char* argv[])
        init_method = 2;
        time_kernel = true;
    }
-    else if(argc == 4)
+    else if((argc == 4) || (argc == 9))
    {
        do_verify   = static_cast<bool>(argv[1]);
        init_method = atoi(argv[2]);
        time_kernel = static_cast<bool>(atoi(argv[3]));
+        if(argc == 9)
+        {
+            inLengths_1[0] = atoi(argv[4]);
+            inLengths_1[1] = atoi(argv[5]);
+            inLengths_1[2] = atoi(argv[6]);
+            inLengths_1[3] = atoi(argv[7]);
+            inLengths_1[4] = atoi(argv[8]);
+            inLengths_2[0] = inLengths_1[0];
+            inLengths_2[1] = inLengths_1[1];
+            inLengths_2[2] = inLengths_1[2];
+            inLengths_2[3] = inLengths_1[3];
+            outLengths[0]  = inLengths_1[0];
+            outLengths[1]  = inLengths_1[1];
+            outLengths[2]  = inLengths_1[2];
+        }
    }
    else
    {
--- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
@@ -78,12 +78,12 @@ bool pool_test(bool do_verification,

            if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value)
            {
-                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz}, layout);
            }
            else if constexpr(ck::is_same<decltype(layout),
                                          ck::tensor_layout::convolution::NHWC>::value)
            {
-                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_}, layout);
            }
        };

--- a/example/14_gemm_quantization/CMakeLists.txt
+++ b/example/14_gemm_quantization/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_example_executable(example_gemm_dl_quantization_int8 gemm_dl_quantization_int8.cpp)
+add_example_executable(example_gemm_wmma_quantization_int8 gemm_wmma_quantization_int8.cpp)
 add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp)
 add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp)
--- a/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
@@ -115,12 +115,14 @@ int main()
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1_uz}));
+                                            std::vector<std::size_t>({stride, 1_uz}),
+                                            layout);
            }
            else
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1_uz, stride}));
+                                            std::vector<std::size_t>({1_uz, stride}),
+                                            layout);
            }
        };

--- a/example/14_gemm_quantization/gemm_wmma_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_wmma_quantization_int8.cpp
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using I8  = int8_t;
+using I32 = int32_t;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using ActivationOp = PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
+
+using ADataType        = I8;
+using BDataType        = I8;
+using AccDataType      = I32;
+using CShuffleDataType = I32;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = I8;
+
+using ALayout  = Col;
+using BLayout  = Row;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Wmma_CShuffleV3<
+    ALayout,
+    BLayout,
+    DsLayout,
+    ELayout,
+    ADataType,
+    BDataType,
+    DsDataType,
+    EDataType,
+    AccDataType,
+    CShuffleDataType,
+    ActivationOp,
+    ActivationOp,
+    CDEElementOp,
+    GemmDefault,
+    256,
+    128,
+    128,
+    64,
+    8,
+    8,
+    16,
+    16,
+    4,
+    2,
+    S<4, 64, 1>,
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    1,
+    8,
+    true,
+    S<4, 64, 1>,
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    1,
+    8,
+    true,
+    1,
+    1,
+    S<1, 32, 1, 8>,
+    S<1>,
+    ck::BlockGemmPipelineScheduler::Intrawave,
+    ck::BlockGemmPipelineVersion::v1,
+    I8,
+    I8>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, EDataType, float, PassThrough, PassThrough, CDEElementOp>;
+
+int main(int /* argc */, char* /* argv */[])
+{
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    // GEMM shape
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = N;
+    ck::index_t StrideE = N;
+
+    float requant_scale = 0.03;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+    b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+
+    auto a_element_op   = PassThrough{};
+    auto b_element_op   = PassThrough{};
+    auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
+
+    // device GEMM
+    auto gemm    = DeviceGemmInstance{};
+    auto invoker = gemm.MakeInvoker();
+
+    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                      std::array<const void*, 0>{},
+                                      static_cast<EDataType*>(e_device_buf.GetDeviceBuffer()),
+                                      M,
+                                      N,
+                                      K,
+                                      StrideA,
+                                      StrideB,
+                                      std::array<ck::index_t, 0>{},
+                                      StrideE,
+                                      1,
+                                      a_element_op,
+                                      b_element_op,
+                                      cde_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+    if(do_verification)
+    {
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n, e_m_n_host_result, a_element_op, b_element_op, cde_element_op);
+
+        ref_invoker.Run(ref_argument);
+
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
--- a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -70,10 +70,10 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
     64,                         // KPerBlock,
     16,                         // AK1,
     16,                         // BK1,
-     32,                         // MPerXDL,
-     32,                         // NPerXDL,
-     4,                          // MXdlPerWave,
-     2,                          // NXdlPerWave,
+     16,                         // MPerXDL,
+     16,                         // NPerXDL,
+     8,                          // MXdlPerWave,
+     4,                          // NXdlPerWave,
     S<4, 64, 1>,                // ABlockTransferThreadClusterLengths_AK0_M_AK1,
     S<1, 0, 2>,                 // ABlockTransferThreadClusterArrangeOrder,
     S<1, 0, 2>,                 // ABlockTransferSrcAccessOrder,
@@ -90,8 +90,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
     1,                          // bool BBlockLdsExtraN,
     1,                          // index_t CShuffleMXdlPerWavePerShuffle,
     1,                          // index_t CShuffleNXdlPerWavePerShuffle,
-     S<1, 64, 1, 4>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-     8>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
+     S<1, 32, 1, 8>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+     4>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
--- a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -68,10 +68,10 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
     64,                         // KPerBlock,
     16,                         // AK1,
     16,                         // BK1,
-     32,                         // MPerXDL,
-     32,                         // NPerXDL,
-     4,                          // MXdlPerWave,
-     2,                          // NXdlPerWave,
+     16,                         // MPerXDL,
+     16,                         // NPerXDL,
+     8,                          // MXdlPerWave,
+     4,                          // NXdlPerWave,
     S<4, 64, 1>,                // ABlockTransferThreadClusterLengths_AK0_M_AK1,
     S<1, 0, 2>,                 // ABlockTransferThreadClusterArrangeOrder,
     S<1, 0, 2>,                 // ABlockTransferSrcAccessOrder,
@@ -88,8 +88,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
     1,                          // bool BBlockLdsExtraN,
     1,                          // index_t CShuffleMXdlPerWavePerShuffle,
     1,                          // index_t CShuffleNXdlPerWavePerShuffle,
-     S<1, 64, 1, 4>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-     16>;                        // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
+     S<1, 32, 1, 8>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+     4>;                        // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::
--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -63,7 +63,7 @@ using DeviceGemmInstance =
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,    S<4, 64, 1>,     S<1, 0, 2>,      S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,       S<4,4,4>>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   16,   16,    2,    4,    S<4, 64, 1>,     S<1, 0, 2>,      S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,       S<4,4,4>>;
 // clang-format on

 struct ProblemSize final
--- a/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -54,7 +54,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
 // clang-format on

 #include "run_grouped_gemm_example.inc"
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
@@ -323,6 +323,31 @@ int main(int argc, char* argv[])

    problem_size.Ms = {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};

+    if(argc == 5)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+        config.k_batch         = std::stoi(argv[4]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        config.k_batch           = std::stoi(argv[4]);
+        problem_size.group_count = std::stoi(argv[5]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: k_batch (>0)\n");
+        printf("arg5: group count (default=16)");
+        exit(0);
+    }
+
    for(int i = 0; i < problem_size.group_count; i++)
    {
        problem_size.Ns.push_back(768);
@@ -333,21 +358,5 @@ int main(int argc, char* argv[])
        problem_size.stride_Cs.push_back(problem_size.Ns[i]);
    }

-    if(argc == 5)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-        config.k_batch         = std::stoi(argv[4]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4: k_batch (>0)\n");
-        exit(0);
-    }
-
    return !run_grouped_gemm(problem_size, config);
 }
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
@@ -296,6 +296,32 @@ int main(int argc, char* argv[])

    problem_size.group_count = 16;

+    if(argc == 5)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+        config.k_batch         = std::stoi(argv[4]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        config.k_batch           = std::stoi(argv[4]);
+        problem_size.group_count = std::stoi(argv[5]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: k_batch (> 0)\n");
+        printf("arg5: group count (default=16)");
+
+        exit(0);
+    }
+
    for(int i = 0; i < problem_size.group_count; i++)
    {
        problem_size.Ms.push_back(128 + rand() % 128);
@@ -307,21 +333,5 @@ int main(int argc, char* argv[])
        problem_size.stride_Cs.push_back(problem_size.Ns[i]);
    }

-    if(argc == 5)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-        config.k_batch         = std::stoi(argv[4]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4: k_batch (> 0)\n");
-        exit(0);
-    }
-
    return !run_grouped_gemm(problem_size, config);
 }
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
@@ -297,6 +297,31 @@ int main(int argc, char* argv[])

    problem_size.group_count = 16;

+    if(argc == 5)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+        config.k_batch         = std::stoi(argv[4]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        config.k_batch           = std::stoi(argv[4]);
+        problem_size.group_count = std::stoi(argv[5]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: k_batch (> 0)\n");
+        printf("arg5: group count (default=16)");
+        exit(0);
+    }
+
    for(int i = 0; i < problem_size.group_count; i++)
    {
        problem_size.Ms.push_back(256 + 256 * i);
@@ -308,21 +333,5 @@ int main(int argc, char* argv[])
        problem_size.stride_Cs.push_back(problem_size.Ns[i]);
    }

-    if(argc == 5)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-        config.k_batch         = std::stoi(argv[4]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4: k_batch (> 0)\n");
-        exit(0);
-    }
-
    return !run_grouped_gemm(problem_size, config);
 }
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -54,7 +54,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
 // clang-format on

 #include "run_grouped_gemm_example.inc"
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -54,7 +54,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    16,   4,   4,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               2>;
 // clang-format on

 #include "run_grouped_gemm_example.inc"
--- a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
@@ -66,6 +66,28 @@ int main(int argc, char* argv[])

    problem_size.group_count = 16;

+    if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 5)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        problem_size.group_count = std::stoi(argv[4]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: group count (default=16)");
+        exit(0);
+    }
+
    for(int i = 0; i < problem_size.group_count; i++)
    {
        problem_size.Ms.push_back(256 + 256 * i);
@@ -77,19 +99,5 @@ int main(int argc, char* argv[])
        problem_size.stride_Cs.push_back(problem_size.Ns[i]);
    }

-    if(argc == 4)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        exit(0);
-    }
-
    return !run_grouped_gemm(problem_size, config);
 }
--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
@@ -278,6 +278,30 @@ bool run_grouped_gemm_example(int argc, char* argv[])

    problem_size.group_count = 16;

+    if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        config.async_hargs       = std::stoi(argv[4]);
+        problem_size.group_count = std::stoi(argv[5]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: async hargs (0=n0, 1=yes)\n");
+        printf("arg5: group count (default=16)");
+        exit(0);
+    }
+
    for(int i = 0; i < problem_size.group_count; i++)
    {
        problem_size.Ms.push_back(256 + 256 * i);
@@ -288,27 +312,6 @@ bool run_grouped_gemm_example(int argc, char* argv[])
        problem_size.stride_Bs.push_back(problem_size.Ks[i]);
        problem_size.stride_Cs.push_back(problem_size.Ns[i]);
    }
-    if(argc == 4)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-    }
-    else if(argc == 5)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-        config.async_hargs     = std::stoi(argv[4]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4: async hargs (0=n0, 1=yes)\n");
-        exit(0);
-    }

    return run_grouped_gemm(problem_size, config);
 }
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -76,7 +76,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
 //######|        |        |        |      Type|      Type|            Type|         DataType|       Type|      Type|              Type|       Type| Elementwise| Elementwise|  Elementwise| Elementwise| Elementwise|           Reduce|           Reduce| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|       ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |   Operation|   Operation|    Operation|   Operation|   Operation|        Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _MPerBlock_NPerBlock|      ScalarPerVector|         _MPerBlock|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |            |            |             |            |            |                 |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |           _NPerBlock|                   |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
+        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,                    4,                  1>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -72,10 +72,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         64,                        // KPerBlock
         16,                        // AK1
         16,                        // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
         S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
         S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
         S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -92,7 +92,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         1,                         // BBlockLdsExtraN
         1,                         // CShuffleMXdlPerWavePerShuffle
         1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
         4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
         1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -65,10 +65,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         32,                        // KPerBlock
         8,                         // AK1
         8,                         // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
         S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
         S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
         S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -85,7 +85,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         1,                         // BBlockLdsExtraN
         1,                         // CShuffleMXdlPerWavePerShuffle
         1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
         4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
         1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -65,10 +65,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         32,                        // KPerBlock
         8,                         // AK1
         8,                         // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
         S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
         S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
         S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -85,7 +85,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         1,                         // BBlockLdsExtraN
         1,                         // CShuffleMXdlPerWavePerShuffle
         1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
         4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
         1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -146,6 +146,11 @@ int main(int argc, char* argv[])
        exit(0);
    }

+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+
    return run_gemm_reduce_max_xdl<ADataType,
                                   BDataType,
                                   EDataType,
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -64,10 +64,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         64,                        // KPerBlock
         16,                        // AK1
         16,                        // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
         S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
         S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
         S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -84,7 +84,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         1,                         // BBlockLdsExtraN
         1,                         // CShuffleMXdlPerWavePerShuffle
         1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
         4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
         1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -72,10 +72,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         32,                        // KPerBlock
         8,                         // AK1
         8,                         // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
         S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
         S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
         S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -92,7 +92,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         1,                         // BBlockLdsExtraN
         1,                         // CShuffleMXdlPerWavePerShuffle
         1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
         4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
         1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -72,10 +72,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         32,                        // KPerBlock
         8,                         // AK1
         8,                         // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
         S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
         S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
         S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -92,7 +92,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         1,                         // BBlockLdsExtraN
         1,                         // CShuffleMXdlPerWavePerShuffle
         1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
         4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
         1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -153,6 +153,11 @@ int main(int argc, char* argv[])
        exit(EXIT_SUCCESS);
    }

+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        exit(EXIT_SUCCESS);
+    }
+
    return !run_gemm_reduce_mean_meansquare_xdl<ADataType,
                                                BDataType,
                                                EDataType,
--- a/example/18_batched_gemm_reduce/CMakeLists.txt
+++ b/example/18_batched_gemm_reduce/CMakeLists.txt
@@ -1,8 +1 @@
-list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp)
-   set(target 1)
- endif()
-endforeach()
+add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp)
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -64,7 +64,7 @@ using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatc
 //######|        |        |        | Type|  Type|  Type| DataType| DataType|  DataType|    Type Tuple| Elementwise| Elementwise| Elementwise|      Reduce|               |                |   MemoryData|     Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths|     SrcDstScalarPerVector|        SrcDstScalarPerVector|
 //######|        |        |        |     |      |      |         |         |          |              |   Operation|   Operation|   Operation|   Operation|               |                |    Operation|                   |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|                _NPerBlock|                   _MPerBlock|
 //######|        |        |        |     |      |      |         |         |          |              |            |            |            |            |               |                |             |                   |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                          |                             |
-        <     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   ReducePtrsGlobal,  AElementOp,  BElementOp,  CElementOp, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceGlobalMemOps, GemmSpecialization,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>;
+        <     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   ReducePtrsGlobal,  AElementOp,  BElementOp,  CElementOp, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceGlobalMemOps, GemmSpecialization,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4,             S<32, 8>,                         4,                            1>;
 // clang-format on

 using ReferenceBatchedGemmInstance =
@@ -137,11 +137,13 @@ int main(int argc, char* argv[])

        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
-            return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {row * stride, stride, 1_uz}, layout);
        }
        else
        {
-            return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {col * stride, 1_uz, stride}, layout);
        }
    };

--- a/example/20_grouped_conv_bwd_weight/common.hpp
+++ b/example/20_grouped_conv_bwd_weight/common.hpp
@@ -123,7 +123,9 @@ inline bool parse_cmd_args(int argc,

        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
        conv_param                        = ck::utils::conv::parse_conv_param(
-            num_dim_spatial, threshold_to_catch_partial_args, argv);
+            num_dim_spatial,
+            threshold_to_catch_partial_args + 1, // +1 because we already parsed num_dim_spatial
+            argv);
    }
    else
    {
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -80,7 +80,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
 //######|        |        |        |      Type|      Type|            Type|         DataType|       Type|      Type|              Type|       Type| Elementwise| Elementwise|  Elementwise| Elementwise| Elementwise|           Reduce|           Reduce| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|       ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |   Operation|   Operation|    Operation|   Operation|   Operation|        Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _MPerBlock_NPerBlock|      ScalarPerVector|         _MPerBlock|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |            |            |             |            |            |                 |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |           _NPerBlock|                   |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
+        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,                    4,                  1>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
@@ -236,7 +236,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
              << " GB/s, " << std::endl;
 }

-int main()
+int main(int argc, char* argv[])
 {
    // GEMM shape
    ck::index_t M = 1024;
@@ -249,6 +249,25 @@ int main()
    ck::index_t StrideD1 = 1024;
    ck::index_t StrideE  = 1024;

+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = static_cast<bool>(std::stoi(argv[2]));
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
    Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
    Tensor<D0DataType> bias_n(f_host_tensor_descriptor1d(N, 1));
@@ -357,6 +376,7 @@ int main()
    normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});

    bool pass = true;
+    if(do_verification)
    {
        // verification
        Tensor<LayerNormOutDataType> host_layerNorm_m_n(
@@ -383,27 +403,25 @@ int main()
                                     1e-2);
    }

+    if(time_kernel)
    {
        // evaluate kernel perf
-        bool time_kernel = true;
-
        float gemm_reduce_mean_reduce_square_mean_ave_time =
-            gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
+            gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, true});
        float normalize_ave_time =
-            normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+            normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, true});

-        if(time_kernel)
-            DumpGemmLayerNormPerf<ADataType,
-                                  BDataType,
-                                  EDataType,
-                                  D0DataType,
-                                  D1DataType,
-                                  R0DataType,
-                                  R1DataType,
-                                  GammaDataType,
-                                  BetaDataType,
-                                  LayerNormOutDataType>(
-                gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
+        DumpGemmLayerNormPerf<ADataType,
+                              BDataType,
+                              EDataType,
+                              D0DataType,
+                              D1DataType,
+                              R0DataType,
+                              R1DataType,
+                              GammaDataType,
+                              BetaDataType,
+                              LayerNormOutDataType>(
+            gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
    }

    return pass ? 0 : 1;
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -65,7 +65,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDLayern
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|             Type|          Type|         Type|      Type| Elementwise| Elementwise|  Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|  ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|
 //######|        |        |         |        |          |          |            |                 |           |                 |              |             |          |   Operation|   Operation|    Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                  _M_N|            _M_N|                 _M_N|              _M|
 //######|        |        |         |        |          |          |            |                 |           |                 |              |             |          |            |            |             |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                      |                |                     |                |
-        < ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EMeanVarDataType, GammaDataType, BetaDataType, HDataType,  AElementOp,  BElementOp, CDEElementOp,   HElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,              S<32, 8>,               8,             S<8, 32>,               8>;
+        < ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EMeanVarDataType, GammaDataType, BetaDataType, HDataType,  AElementOp,  BElementOp, CDEElementOp,   HElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,              S<32, 8>,               4,             S<8, 32>,               4>;
 // clang-format on

 auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
@@ -154,6 +154,12 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,

 int main()
 {
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+
    bool do_verification = true;

    // GEMM shape
--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -77,7 +77,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
 //######|        |        |        |      Type|      Type|            Type|         DataType|       Type|      Type|              Type|       Type| Elementwise| Elementwise|  Elementwise| Elementwise| Elementwise|           Reduce|           Reduce| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|       ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |   Operation|   Operation|    Operation|   Operation|   Operation|        Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _MPerBlock_NPerBlock|      ScalarPerVector|         _MPerBlock|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |            |            |             |            |            |                 |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |           _NPerBlock|                   |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
+        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,                    4,                  1>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
@@ -221,7 +221,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
              << " GB/s, " << std::endl;
 }

-int main()
+int main(int argc, char* argv[])
 {
    // GEMM shape
    ck::index_t M = 1024;
@@ -232,6 +232,25 @@ int main()
    ck::index_t StrideB = 1024;
    ck::index_t StrideE = 1024;

+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = static_cast<bool>(std::stoi(argv[2]));
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
    Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
    Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
@@ -333,6 +352,7 @@ int main()
    normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});

    bool pass = true;
+    if(do_verification)
    {
        // verification
        Tensor<LayerNormOutDataType> host_layerNorm_m_n(
@@ -354,25 +374,23 @@ int main()
            layerNorm_m_n, host_layerNorm_m_n, "Error: Incorrect results d1", 1e-3, 1e-3);
    }

+    if(time_kernel)
    {
        // evaluate kernel perf
-        bool time_kernel = true;
-
        float gemm_reduce_mean_reduce_square_mean_ave_time =
-            gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
+            gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, true});
        float normalize_ave_time =
-            normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+            normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, true});

-        if(time_kernel)
-            DumpGemmLayerNormPerf<ADataType,
-                                  BDataType,
-                                  EDataType,
-                                  R0DataType,
-                                  R1DataType,
-                                  GammaDataType,
-                                  BetaDataType,
-                                  LayerNormOutDataType>(
-                gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
+        DumpGemmLayerNormPerf<ADataType,
+                              BDataType,
+                              EDataType,
+                              R0DataType,
+                              R1DataType,
+                              GammaDataType,
+                              BetaDataType,
+                              LayerNormOutDataType>(
+            gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
    }

    return pass ? 0 : 1;
--- a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -70,7 +70,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmLayerNorm_Xdl
 //######|        |        |        |      Type|      Type|      Type|       Type|    DataType|         DataType|    DataType| Elementwise| Elementwise|  Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector|
 //######|        |        |        |          |          |          |           |            |                 |            |   Operation|   Operation|    Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|            _NPerBlock|
 //######|        |        |        |          |          |          |           |            |                 |            |            |            |             |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                      |
-        <     Row,     Col,     Row, ADataType, BDataType, CDataType, C0DataType, AccDataType, CShuffleDataType, AccDataType,  AElementOp,  BElementOp, AccElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           2,               S<1, 32, 1, 8>,               8,             S<64, 4>,                     4>;
+        <     Row,     Col,     Row, ADataType, BDataType, CDataType, C0DataType, AccDataType, CShuffleDataType, AccDataType,  AElementOp,  BElementOp, AccElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           4,               S<1, 32, 1, 8>,               8,             S<32, 8>,                     4>;
 // clang-format on

 using ReferenceInstance = ck::tensor_operation::host::ReferenceGemmLayernorm<ADataType,
--- a/example/22_cgemm/cgemm_xdl_bf16.cpp
+++ b/example/22_cgemm/cgemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>

@@ -48,10 +48,10 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
     32,                         // index_t KPerBlock
     8,                          // index_t AK1
     8,                          // index_t BK1
-     32,                         // index_t MPerXDL
-     32,                         // index_t NPerXDL
-     4,                          // index_t MXdlPerWave
-     2,                          // index_t NXdlPerWave
+     16,                         // index_t MPerXDL
+     16,                         // index_t NPerXDL
+     8,                          // index_t MXdlPerWave
+     4,                          // index_t NXdlPerWave
     S<4, 64, 1>,                // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
     S<1, 0, 2>,                 // typename ABlockTransferThreadClusterArrangeOrder
     S<1, 0, 2>,                 // typename ABlockTransferSrcAccessOrder
@@ -69,7 +69,7 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
     1,                          // index_t CShuffleMXdlPerWavePerShuffle
     1,                          // index_t CShuffleNXdlPerWavePerShuffle
     S<1, 32, 1, 8>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-     8>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
+     4>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
 // clang-format on

 int main(int argc, char* argv[])
--- a/example/22_cgemm/cgemm_xdl_fp16.cpp
+++ b/example/22_cgemm/cgemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>

@@ -47,10 +47,10 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
     32,                         // index_t KPerBlock
     8,                          // index_t AK1
     8,                          // index_t BK1
-     32,                         // index_t MPerXDL
-     32,                         // index_t NPerXDL
-     4,                          // index_t MXdlPerWave
-     2,                          // index_t NXdlPerWave
+     16,                         // index_t MPerXDL
+     16,                         // index_t NPerXDL
+     8,                          // index_t MXdlPerWave
+     4,                          // index_t NXdlPerWave
     S<4, 64, 1>,                // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
     S<1, 0, 2>,                 // typename ABlockTransferThreadClusterArrangeOrder
     S<1, 0, 2>,                 // typename ABlockTransferSrcAccessOrder
@@ -68,7 +68,7 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
     1,                          // index_t CShuffleMXdlPerWavePerShuffle
     1,                          // index_t CShuffleNXdlPerWavePerShuffle
     S<1, 32, 1, 8>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-     8>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
+     4>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
 // clang-format on

 int main(int argc, char* argv[])
--- a/Show More
+++ b/Show More