diff --git a/.gitignore b/.gitignore
index 17f93500bd..04ac34466f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -112,4 +112,30 @@ test_data/*
experimental/grouped_convolution_tile_instances/instances/*
!experimental/grouped_convolution_tile_instances/instances/*.in
!experimental/grouped_convolution_tile_instances/instances/*.inc
+!experimental/grouped_convolution_tile_instances/instances/*.hpp
experimental/grouped_convolution_tile_instances/*.inc
+# Heuristics: benchmark data (never in git)
+dispatcher/heuristics/data/
+
+# Heuristics: experimental/training artifacts (exclude from git)
+dispatcher/heuristics/models/**/oof_predictions.parquet
+dispatcher/heuristics/models/**/cv_metrics_*.json
+dispatcher/heuristics/models/**/eval_report.json
+dispatcher/heuristics/models/**/feature_importances_*.json
+dispatcher/heuristics/models/**/model_tflops_ihem.lgbm
+dispatcher/heuristics/models/**/model_tflops_log.lgbm
+dispatcher/heuristics/models/**/model_tflops_log_big.lgbm
+
+# Heuristics: keep in git (production model files):
+# models/{op}_{dtype}_{arch}/model_tflops.lgbm
+# models/{op}_{dtype}_{arch}/model_latency.lgbm
+# models/{op}_{dtype}_{arch}/model_bandwidth.lgbm
+# models/{op}_{dtype}_{arch}/feature_spec.json
+# models/{op}_{dtype}_{arch}/train_manifest.json
+
+# Heuristics: logs and caches
+dispatcher/heuristics/*.log
+dispatcher/heuristics/__pycache__/
+dispatcher/heuristics/tests/__pycache__/
+dispatcher/heuristics/.pytest_cache/
+
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index b3299fa4e8..50fa167b41 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -4,13 +4,13 @@ version: 2
sphinx:
- configuration: docs/conf.py
+ configuration: projects/composablekernel/docs/conf.py
formats: [htmlzip, pdf, epub]
python:
install:
- - requirements: docs/sphinx/requirements.txt
+ - requirements: projects/composablekernel/docs/sphinx/requirements.txt
build:
os: ubuntu-22.04
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 370e9e4243..f6812a8520 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
* Added FP8 block scale quantization for FMHA forward kernel.
* Added gfx11 support for FMHA.
* Added microscaling (MX) FP8/FP4 support on gfx950 for FMHA forward kernel ("qr" pipeline only).
+* Added FP8 per-tensor quantization support for FMHA forward V3 pipeline on gfx950.
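The `.gitignore` rules for `dispatcher/heuristics/` above encode a simple policy: training by-products stay out of git, while a fixed set of production model files per `models/{op}_{dtype}_{arch}/` directory is kept. Purely as an illustration of that policy (none of this code exists in the repository; the file names simply restate the keep/exclude lists in the patch), a small Python check might look like:

```python
# Illustrative restatement of the heuristics .gitignore policy above.
# The file names mirror the keep/exclude lists in the patch; nothing here
# is part of the repository itself.
import fnmatch
from pathlib import Path

KEEP = {
    "model_tflops.lgbm",
    "model_latency.lgbm",
    "model_bandwidth.lgbm",
    "feature_spec.json",
    "train_manifest.json",
}
EXCLUDE = [
    "oof_predictions.parquet",
    "cv_metrics_*.json",
    "eval_report.json",
    "feature_importances_*.json",
    "model_tflops_ihem.lgbm",
    "model_tflops_log.lgbm",
    "model_tflops_log_big.lgbm",
]

def tracked_in_git(path: str) -> bool:
    """True if a file under models/{op}_{dtype}_{arch}/ is meant to be committed."""
    name = Path(path).name
    if name in KEEP:
        return True
    return not any(fnmatch.fnmatch(name, pattern) for pattern in EXCLUDE)
```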
### Changed diff --git a/Dockerfile b/Dockerfile index f19bc69362..de129d0703 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,22 +2,33 @@ FROM ubuntu:24.04 ARG DEBIAN_FRONTEND=noninteractive ARG ROCMVERSION=7.1.1 +ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ +ARG TARBALL_URL=https://rocm.nightlies.amd.com/tarball/therock-dist-linux-gfx90X-dcgpu-7.12.0a20260218.tar.gz ARG compiler_version="" ARG compiler_commit="" -ARG CK_SCCACHE="" -ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn ENV DEBIAN_FRONTEND=noninteractive +ENV PATH=$PATH:/opt/rocm/bin +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib +ENV HIP_PLATFORM=amd # Add rocm repository RUN set -xe && \ apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl -RUN wget https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/noble/amdgpu-install_7.1.1.70101-1_all.deb && \ - apt install ./amdgpu-install_7.1.1.70101-1_all.deb -y && \ - apt update && \ - apt install python3-setuptools python3-wheel -y && \ - apt install rocm-dev -y +RUN if [ "$compiler_version" = "therock" ]; then \ + rm -rf /opt/rocm && mkdir /opt/rocm && \ + echo "Downloading ROCm tarball from $TARBALL_URL..." && \ + wget -q -O /tmp/rocm.tar.gz "$TARBALL_URL" && \ + echo "Extracting tarball to /opt/rocm..." && \ + tar -xzf /tmp/rocm.tar.gz -C /opt/rocm --strip-components=1 ; \ + else echo "using the release compiler" && \ + wget https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/noble/amdgpu-install_7.1.1.70101-1_all.deb && \ + apt install ./amdgpu-install_7.1.1.70101-1_all.deb -y && \ + apt update && \ + apt install python3-setuptools python3-wheel -y && \ + apt install rocm-dev -y; \ + fi # Install SCCACHE ENV SCCACHE_VERSION="0.14.0" @@ -34,7 +45,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- build-essential \ cmake \ git \ - hip-rocclr \ iputils-ping \ jq \ libelf-dev \ @@ -44,8 +54,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- net-tools \ pkg-config \ python3-full \ + python3-pip \ redis \ - rocm-llvm-dev \ sshpass \ stunnel \ software-properties-common \ @@ -88,26 +98,3 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- git clone -b master https://github.com/ROCm/rocm-cmake.git && \ cd rocm-cmake && mkdir build && cd build && \ cmake .. && cmake --build . && cmake --build . 
--target install - -WORKDIR / -# Add alternative compilers, if necessary -ENV compiler_version=$compiler_version -ENV compiler_commit=$compiler_commit -RUN sh -c "echo compiler version = '$compiler_version'" && \ - sh -c "echo compiler commit = '$compiler_commit'" - -RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \ - git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ - cd llvm-project && mkdir build && cd build && \ - cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ - make -j 8 ; \ - else echo "using the release compiler"; \ - fi - -RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \ - git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ - cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ - cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ - make -j 8 ; \ - else echo "using the release compiler"; \ - fi diff --git a/Dockerfile.aiter b/Dockerfile.aiter index a5a3f81fca..ebfef41643 100644 --- a/Dockerfile.aiter +++ b/Dockerfile.aiter @@ -4,7 +4,7 @@ ARG AITER_BRANCH="main" ARG CK_AITER_BRANCH="develop" # CK_FROM_ROCM_LIBRARIES - 1: CK from rocm-libraries sparse-checkout; 0: direct clone from ROCm/composable_kernel ARG CK_FROM_ROCM_LIBRARIES=1 -RUN pip install pandas zmq einops ninja tabulate && \ +RUN pip install pandas zmq einops ninja tabulate vcs_versioning && \ pip install numpy==1.26.2 && \ sudo mkdir /home/jenkins && \ sudo mkdir /home/jenkins/workspace && \ diff --git a/Dockerfile.compiler b/Dockerfile.compiler index 8f5503d79e..9d1e54106e 100644 --- a/Dockerfile.compiler +++ b/Dockerfile.compiler @@ -9,18 +9,70 @@ ENV compiler_commit=$compiler_commit RUN sh -c "echo compiler version = '$compiler_version'" && \ sh -c "echo compiler commit = '$compiler_commit'" -RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \ +RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-staging" ] ) && [ "$compiler_commit" = "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ - cd llvm-project && mkdir build && cd build && \ - cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ - make -j 16 ; \ + cd llvm-project && git log -1 && mkdir build && cd build && \ + cmake -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_PROJECTS="clang;lld;clang-tools-extra;flang;mlir" \ + -DLLVM_LIT_ARGS="-vv --show-unsupported --show-xfail -j 32" \ + -DPACKAGE_VENDOR="AMD" \ + -DCMAKE_INSTALL_PREFIX=/home/$USER/rocm/pure_llvm_1.0 \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_BUILD_DOCS=ON \ + -DLLVM_TARGETS_TO_BUILD=all \ + -DLIBOMPTARGET_ENABLE_DEBUG=ON \ + -DOFFLOAD_ENABLE_EMISSARY_APIS=OFF \ + -DCLANG_DEFAULT_LINKER=lld \ + -DCLANG_DEFAULT_PIE_ON_LINUX=0 \ + 
-DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;openmp;compiler-rt;libunwind;flang-rt" \ + -DLIBCXX_ENABLE_SHARED=OFF \ + -DLIBCXX_ENABLE_STATIC=ON \ + -DLIBCXX_INSTALL_LIBRARY=OFF \ + -DLIBCXX_INSTALL_HEADERS=OFF \ + -DLIBCXXABI_ENABLE_SHARED=OFF \ + -DLIBCXXABI_ENABLE_STATIC=ON \ + -DLIBCXXABI_INSTALL_STATIC_LIBRARY=OFF \ + -DLLVM_ENABLE_ASSERTIONS=1 \ + -DLLVM_ENABLE_Z3_SOLVER=OFF \ + -DLLVM_ENABLE_ZLIB=ON \ + -DLLVM_LINK_LLVM_DYLIB=OFF \ + -DCLANG_LINK_CLANG_DYLIB=OFF \ + ../llvm && \ + ninja -j16 ; \ else echo "using the release compiler"; \ fi -RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \ +RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-staging" ] ) && [ "$compiler_commit" != "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ - cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ - make -j 16 ; \ + cmake -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_PROJECTS="clang;lld;clang-tools-extra;flang;mlir" \ + -DLLVM_LIT_ARGS="-vv --show-unsupported --show-xfail -j 32" \ + -DPACKAGE_VENDOR="AMD" \ + -DCMAKE_INSTALL_PREFIX=/home/$USER/rocm/pure_llvm_1.0 \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_BUILD_DOCS=ON \ + -DLLVM_TARGETS_TO_BUILD=all \ + -DLIBOMPTARGET_ENABLE_DEBUG=ON \ + -DOFFLOAD_ENABLE_EMISSARY_APIS=OFF \ + -DCLANG_DEFAULT_LINKER=lld \ + -DCLANG_DEFAULT_PIE_ON_LINUX=0 \ + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;openmp;compiler-rt;libunwind;flang-rt" \ + -DLIBCXX_ENABLE_SHARED=OFF \ + -DLIBCXX_ENABLE_STATIC=ON \ + -DLIBCXX_INSTALL_LIBRARY=OFF \ + -DLIBCXX_INSTALL_HEADERS=OFF \ + -DLIBCXXABI_ENABLE_SHARED=OFF \ + -DLIBCXXABI_ENABLE_STATIC=ON \ + -DLIBCXXABI_INSTALL_STATIC_LIBRARY=OFF \ + -DLLVM_ENABLE_ASSERTIONS=1 \ + -DLLVM_ENABLE_Z3_SOLVER=OFF \ + -DLLVM_ENABLE_ZLIB=ON \ + -DLLVM_LINK_LLVM_DYLIB=OFF \ + -DCLANG_LINK_CLANG_DYLIB=OFF \ + ../llvm && \ + ninja -j16 ; \ else echo "using the release compiler"; \ fi diff --git a/Dockerfile.fa b/Dockerfile.fa new file mode 100644 index 0000000000..c5cbacfc16 --- /dev/null +++ b/Dockerfile.fa @@ -0,0 +1,43 @@ +ARG BASE_DOCKER="rocm/pytorch:latest" +FROM $BASE_DOCKER +ARG FA_ORIGIN="ROCm" +ARG FA_BRANCH="tridao" +ARG CK_FA_ORIGIN="ROCm" +ARG CK_FA_BRANCH="develop" +# CK_FROM_ROCM_LIBRARIES - 1: CK from rocm-libraries sparse-checkout; 0: direct clone from ROCm/composable_kernel +ARG CK_FROM_ROCM_LIBRARIES=1 +ARG GPU_ARCHS="gfx90a;gfx942;gfx950" +RUN set -x ; \ + sudo mkdir /home/jenkins && \ + sudo mkdir /home/jenkins/workspace && \ + cd /home/jenkins/workspace && rm -rf rocm-libraries ck && \ + if [ "$CK_FROM_ROCM_LIBRARIES" = "1" ]; then \ + git clone --depth 1 -b "$CK_FA_BRANCH" --no-checkout --filter=blob:none https://github.com/$CK_FA_ORIGIN/rocm-libraries.git && \ + cd rocm-libraries && \ + git sparse-checkout init --cone && \ + git sparse-checkout set projects/composablekernel && \ + git checkout "$CK_FA_BRANCH" && \ + ROCM_LIBRARIES_SHA=$(git rev-parse --short HEAD) && \ + mv projects/composablekernel ../ck && \ + cd ../ck && rm -rf ../rocm-libraries && \ + git init && \ + git config user.name "assistant-librarian[bot]" && \ + git config user.email 
"assistant-librarian[bot]@users.noreply.github.com" && \ + git branch -m "$CK_FA_BRANCH" && git add -A && \ + git commit -m "import from ROCm/rocm-libraries@$ROCM_LIBRARIES_SHA" > /dev/null ; \ + else \ + git clone --depth 1 -b "$CK_FA_BRANCH" https://github.com/$CK_FA_ORIGIN/composable_kernel.git ck ; \ + fi && \ + cd /home/jenkins/workspace && rm -rf flash-attention && \ + git clone --depth 1 -b "$FA_BRANCH" --recursive "https://github.com/$FA_ORIGIN/flash-attention.git" && \ + cd flash-attention && \ + rm -rf csrc/composable_kernel/ && \ + git clone -b "$CK_FA_BRANCH" ../ck csrc/composable_kernel/ && git add csrc/composable_kernel && \ + MAX_JOBS=$(nproc) GPU_ARCHS="$GPU_ARCHS" /opt/venv/bin/python3 -u -m pip install --no-build-isolation -v . && \ + groupadd -g 1001 jenkins && \ + useradd -u 1001 -g 1001 -m -s /bin/bash jenkins && \ + chown -R jenkins:jenkins /home/jenkins && \ + chmod -R a+rwx /home/jenkins && \ + chown -R jenkins:jenkins /tmp && \ + chmod -R a+rwx /tmp && \ + sudo usermod -aG irc jenkins diff --git a/Dockerfile.manylinux b/Dockerfile.manylinux index 2c0bec2840..bfbe847b1d 100644 --- a/Dockerfile.manylinux +++ b/Dockerfile.manylinux @@ -3,7 +3,6 @@ ARG DEBIAN_FRONTEND=noninteractive ARG ROCMVERSION=7.2 ARG compiler_version="" ARG compiler_commit="" -ARG CK_SCCACHE="" ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn ENV DEBIAN_FRONTEND=noninteractive @@ -19,16 +18,15 @@ RUN wget https://repo.radeon.com/amdgpu-install/7.2/rhel/8.10/amdgpu-install-7.2 dnf install python3-setuptools python3-wheel -y && \ dnf install rocm-dev -y -## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined -ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache +# Install SCCACHE +ENV SCCACHE_VERSION="0.14.0" ENV SCCACHE_INSTALL_LOCATION=/usr/local/.cargo/bin ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION} -ENV CK_SCCACHE=$CK_SCCACHE -RUN if [ "$CK_SCCACHE" != "" ]; then \ - mkdir -p ${SCCACHE_INSTALL_LOCATION} && \ - curl ${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \ - chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache; \ - fi +RUN set -x && \ + mkdir -p ${SCCACHE_INSTALL_LOCATION} && \ + wget -qO sccache.tar.gz https://github.com/mozilla/sccache/releases/latest/download/sccache-v$SCCACHE_VERSION-x86_64-unknown-linux-musl.tar.gz && \ + tar -xzf sccache.tar.gz --strip-components=1 -C ${SCCACHE_INSTALL_LOCATION} && \ + chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache # Install dependencies RUN dnf update -y && DEBIAN_FRONTEND=noninteractive dnf install -y \ @@ -83,19 +81,71 @@ ENV compiler_commit=$compiler_commit RUN sh -c "echo compiler version = '$compiler_version'" && \ sh -c "echo compiler commit = '$compiler_commit'" -RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \ +RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-staging" ] ) && [ "$compiler_commit" = "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ cd llvm-project && mkdir build && cd build && \ - cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ - make -j 8 ; \ + cmake -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + 
-DLLVM_ENABLE_PROJECTS="clang;lld;clang-tools-extra;flang;mlir" \ + -DLLVM_LIT_ARGS="-vv --show-unsupported --show-xfail -j 32" \ + -DPACKAGE_VENDOR="AMD" \ + -DCMAKE_INSTALL_PREFIX=/home/$USER/rocm/pure_llvm_1.0 \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_BUILD_DOCS=ON \ + -DLLVM_TARGETS_TO_BUILD=all \ + -DLIBOMPTARGET_ENABLE_DEBUG=ON \ + -DOFFLOAD_ENABLE_EMISSARY_APIS=OFF \ + -DCLANG_DEFAULT_LINKER=lld \ + -DCLANG_DEFAULT_PIE_ON_LINUX=0 \ + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;openmp;compiler-rt;libunwind;flang-rt" \ + -DLIBCXX_ENABLE_SHARED=OFF \ + -DLIBCXX_ENABLE_STATIC=ON \ + -DLIBCXX_INSTALL_LIBRARY=OFF \ + -DLIBCXX_INSTALL_HEADERS=OFF \ + -DLIBCXXABI_ENABLE_SHARED=OFF \ + -DLIBCXXABI_ENABLE_STATIC=ON \ + -DLIBCXXABI_INSTALL_STATIC_LIBRARY=OFF \ + -DLLVM_ENABLE_ASSERTIONS=1 \ + -DLLVM_ENABLE_Z3_SOLVER=OFF \ + -DLLVM_ENABLE_ZLIB=ON \ + -DLLVM_LINK_LLVM_DYLIB=OFF \ + -DCLANG_LINK_CLANG_DYLIB=OFF \ + ../llvm && \ + ninja -j16 ; \ else echo "using the release compiler"; \ fi -RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \ +RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-staging" ] ) && [ "$compiler_commit" != "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ - cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ - make -j 8 ; \ + cmake -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_PROJECTS="clang;lld;clang-tools-extra;flang;mlir" \ + -DLLVM_LIT_ARGS="-vv --show-unsupported --show-xfail -j 32" \ + -DPACKAGE_VENDOR="AMD" \ + -DCMAKE_INSTALL_PREFIX=/home/$USER/rocm/pure_llvm_1.0 \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_BUILD_DOCS=ON \ + -DLLVM_TARGETS_TO_BUILD=all \ + -DLIBOMPTARGET_ENABLE_DEBUG=ON \ + -DOFFLOAD_ENABLE_EMISSARY_APIS=OFF \ + -DCLANG_DEFAULT_LINKER=lld \ + -DCLANG_DEFAULT_PIE_ON_LINUX=0 \ + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;openmp;compiler-rt;libunwind;flang-rt" \ + -DLIBCXX_ENABLE_SHARED=OFF \ + -DLIBCXX_ENABLE_STATIC=ON \ + -DLIBCXX_INSTALL_LIBRARY=OFF \ + -DLIBCXX_INSTALL_HEADERS=OFF \ + -DLIBCXXABI_ENABLE_SHARED=OFF \ + -DLIBCXXABI_ENABLE_STATIC=ON \ + -DLIBCXXABI_INSTALL_STATIC_LIBRARY=OFF \ + -DLLVM_ENABLE_ASSERTIONS=1 \ + -DLLVM_ENABLE_Z3_SOLVER=OFF \ + -DLLVM_ENABLE_ZLIB=ON \ + -DLLVM_LINK_LLVM_DYLIB=OFF \ + -DCLANG_LINK_CLANG_DYLIB=OFF \ + ../llvm && \ + ninja -j16 ; \ else echo "using the release compiler"; \ fi diff --git a/Jenkinsfile b/Jenkinsfile index 22709f414a..42ca1756c0 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -81,71 +81,6 @@ def checkoutComposableKernel() checkout scm } -// Given a pattern, check if the log contains the pattern and return the context. -def checkForPattern(pattern, log) { - def lines = log.split('\n') - for (int i = 0; i < lines.size(); i++) { - if (lines[i] =~ pattern) { - echo "Found pattern match in log for ${pattern}" - - // Get the two lines before and after failure. 
- def contextStart = Math.max(0, i - 2) - def contextEnd = Math.min(lines.size() - 1, i + 2) - def contextLines = [] - for (int j = contextStart; j <= contextEnd; j++) { - contextLines.add(lines[j]) - } - - return [found: true, matchedLine: lines[i], context: contextLines.join('\n')] - } - } - echo "No pattern match found in log for ${pattern}" - return [found: false, matchedLine: "", context: ""] -} - -// Scan the build logs for failures and send notifications. -def sendFailureNotifications() { - // Error patterns to scan build logs for specific failure types and send detailed notifications. - def failurePatterns = [ - [pattern: /login attempt to .* failed with status: 401 Unauthorized/, description: "Docker registry authentication failed"], - [pattern: /.*docker login failed.*/, description: "Docker login failed"], - [pattern: /HTTP request sent .* 404 Not Found/, description: "HTTP request failed with 404"], - [pattern: /cat: .* No such file or directory/, description: "GPU not found"], - [pattern: /.*GPU not found.*/, description: "GPU not found"], - [pattern: /Could not connect to Redis at .* Connection timed out/, description: "Redis connection timed out"], - [pattern: /.*unauthorized: your account must log in with a Personal Access Token.*/, description: "Docker login failed"], - [pattern: /.*sccache: error: Server startup failed: Address in use.*/, description: "Sccache Error"] - ] - - // Get the build log. - def buildLog = sh(script: 'wget -q --no-check-certificate -O - ' + BUILD_URL + 'consoleText', returnStdout: true) - echo "Checking for failure patterns..." - // Check for patterns in the log. - // def foundPatterns = [] - // for (patternMap in failurePatterns) { - // def result = checkForPattern(patternMap.pattern, buildLog) - // if (result.found) { - // foundPatterns.add([ - // description: patternMap.description, - // matchedLine: result.matchedLine, - // context: result.context - // ]) - // } - // } - echo "Done checking for failure patterns..." - // Send a notification for each matched failure pattern. - for (patternMap in foundPatterns) { - withCredentials([string(credentialsId: 'ck_ci_errors_webhook_url', variable: 'WEBHOOK_URL')]) { - sh ''' - curl -X POST "${WEBHOOK_URL}" \ - -H 'Content-Type: application/json' \ - -d '{"text": "\\n\\n**Build Failed**\\n\\n**Issues detected:** ''' + patternMap.description + '''\\n\\n**Log context:**\\n```\\n''' + patternMap.context.replace("'", "\\'") + '''\\n```\\n\\n**Job:** ''' + env.JOB_NAME + '''\\n\\n**Build:** #''' + env.BUILD_NUMBER + '''\\n\\n**URL:** ''' + env.RUN_DISPLAY_URL + '''"}' - ''' - } - } - echo "Done failure pattern checking and notifications" -} - def generateAndArchiveBuildTraceVisualization(String buildTraceFileName) { try { checkoutComposableKernel() @@ -479,51 +414,86 @@ def getDockerImage(Map conf=[:]){ return [retimage, image] } -def buildDocker(install_prefix){ +// Build and push a docker image, capturing its digest into the specified env var. +// If forceBuild is false, will skip building if the image already exists in the registry. 
+def buildAndPushDockerImage(String install_prefix, String image_name, String dockerExtraArgs, boolean forceBuild){ show_node_info() env.DOCKER_BUILDKIT=1 checkoutComposableKernel() - def image_name = getDockerImageName() - def base_image_name = getBaseDockerImageName() - echo "Building Docker for ${image_name}" - def dockerArgs = "--build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - if(params.COMPILER_VERSION == "develop" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){ - dockerArgs = dockerArgs + " --no-cache --build-arg BASE_DOCKER='${base_image_name}' -f projects/composablekernel/Dockerfile.compiler . " - } - else if(params.RUN_AITER_TESTS){ - image_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_aiter" - dockerArgs = dockerArgs + " --no-cache -f projects/composablekernel/Dockerfile.aiter --build-arg AITER_BRANCH='${params.aiter_branch}' --build-arg CK_AITER_BRANCH='${params.ck_aiter_branch}' . " - } - else if(params.RUN_PYTORCH_TESTS){ - image_name = "${env.CK_DOCKERHUB}:ck_pytorch" - dockerArgs = dockerArgs + " --no-cache -f projects/composablekernel/Dockerfile.pytorch --build-arg CK_PYTORCH_BRANCH='${params.ck_pytorch_branch}' . " - } - else{ - dockerArgs = dockerArgs + " -f projects/composablekernel/Dockerfile . " - } - echo "Build Args: ${dockerArgs}" - try{ - if(params.BUILD_DOCKER || params.RUN_AITER_TESTS || params.RUN_PYTORCH_TESTS){ - //force building the new docker if that parameter is true - echo "Building image: ${image_name}" - retimage = docker.build("${image_name}", dockerArgs) - withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) { - retimage.push() - } - sh 'docker images -q -f dangling=true | xargs --no-run-if-empty docker rmi' - } - else{ + def dockerArgs = "--build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " + dockerArgs += " " + dockerExtraArgs + + if(!forceBuild){ + try{ echo "Checking for image: ${image_name}" sh "docker manifest inspect --insecure ${image_name}" echo "Image: ${image_name} found! Skipping building image" + return image_name + } + catch(Exception ex){ + echo "Unable to locate image: ${image_name}. Will attempt to build image now." } } - catch(Exception ex){ - echo "Unable to locate image: ${image_name}. Building image now" - retimage = docker.build("${image_name}", dockerArgs) - withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) { - retimage.push() - } + + echo "Building image: ${image_name} with args: ${dockerArgs}" + def retimage = docker.build("${image_name}", dockerArgs) + withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) { + retimage.push() + } + def digest = sh(returnStdout: true, script: "docker inspect --format='{{index .RepoDigests 0}}' ${image_name}").trim() + echo "Built image digest: ${digest}" + echo "Pruning dangling Docker images to free disk space on CI agent" + sh "docker image prune -f --filter 'dangling=true' || true" + return digest +} + +def buildDockerBase(install_prefix){ + def image_name = getDockerImageName() + def base_image_name = getBaseDockerImageName() + echo "Building Docker for ${image_name}" + def dockerExtraArgs = " -f projects/composablekernel/Dockerfile . 
" + if(params.COMPILER_VERSION == "develop" || params.COMPILER_VERSION == "amd-staging" || params.COMPILER_COMMIT != ""){ + dockerExtraArgs = " --no-cache --build-arg BASE_DOCKER='${base_image_name}' -f projects/composablekernel/Dockerfile.compiler . " + } + else if(params.COMPILER_VERSION == "therock"){ + dockerExtraArgs = " --no-cache -f projects/composablekernel/Dockerfile . " + } + env.CK_BASE_IMAGE = buildAndPushDockerImage(install_prefix, image_name, dockerExtraArgs, params.BUILD_DOCKER.toBoolean()) +} + +def buildDockerPytorch(install_prefix){ + def image_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_pytorch" + def dockerExtraArgs = " --no-cache -f projects/composablekernel/Dockerfile.pytorch --build-arg CK_PYTORCH_BRANCH='${params.ck_pytorch_branch}' . " + env.CK_PYTORCH_IMAGE = buildAndPushDockerImage(install_prefix, image_name, dockerExtraArgs, true) +} + +def buildDockerAiter(install_prefix){ + def image_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_aiter" + def dockerExtraArgs = " --no-cache -f projects/composablekernel/Dockerfile.aiter --build-arg AITER_BRANCH='${params.aiter_branch}' --build-arg CK_AITER_BRANCH='${params.ck_aiter_branch}' . " + env.CK_AITER_IMAGE = buildAndPushDockerImage(install_prefix, image_name, dockerExtraArgs, true) +} + +def buildDockerFa(install_prefix){ + def image_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_fa" + def dockerExtraArgs = " --no-cache -f projects/composablekernel/Dockerfile.fa" + dockerExtraArgs += " --build-arg BASE_DOCKER='${params.fa_base_docker}'" + dockerExtraArgs += " --build-arg FA_BRANCH='${params.fa_branch}'" + dockerExtraArgs += " --build-arg CK_FA_BRANCH='${params.ck_fa_branch}'" + dockerExtraArgs += " --build-arg GPU_ARCHS='gfx942;gfx950'" + dockerExtraArgs += " . " + env.CK_FA_IMAGE = buildAndPushDockerImage(install_prefix, image_name, dockerExtraArgs, true) +} + +def buildDocker(install_prefix){ + buildDockerBase(install_prefix) + if (params.RUN_PYTORCH_TESTS.toBoolean()) { + buildDockerPytorch(install_prefix) + } + if (params.RUN_AITER_TESTS.toBoolean()) { + buildDockerAiter(install_prefix) + } + if (params.RUN_FA_TESTS.toBoolean()) { + buildDockerFa(install_prefix) } } @@ -535,10 +505,10 @@ def get_docker_options(){ else{ //only add kfd and dri paths if you actually going to run somthing on GPUs dockerOpts = "--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" } - if (params.COMPILER_VERSION == "develop" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){ + if (params.COMPILER_VERSION == "develop" || params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "therock" || params.COMPILER_COMMIT != ""){ // the --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 env variable is required when building code with offload-compress flag with // newer clang22 compilers and running with older hip runtima libraries - dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 " + dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 --env HIP_PLATFORM=amd " } // on some machines the group ids for video and render groups may not be the same as in the docker image! 
def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') @@ -1148,99 +1118,73 @@ def process_results(Map conf=[:]){ } } -def run_aiter_tests(Map conf=[:]){ +def run_downstream_tests(Map conf=[:]){ show_node_info() checkoutComposableKernel() - //use the latest pytorch image - def image = "${env.CK_DOCKERHUB_PRIVATE}:ck_aiter" - def dockerOpts=get_docker_options() + ' --group-add irc ' + def dockerOpts = get_docker_options() + ' --group-add irc ' gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'rocm-libraries') { try { - echo "Pulling image: ${image}" - retimage = docker.image("${image}") + echo "Pulling image: ${conf.image}" + retimage = docker.image("${conf.image}") withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) { retimage.pull() } } catch(Exception ex) { - error "Unable to locate image: ${image}" + error "Unable to locate image: ${conf.image}" } } - withDockerContainer(image: image, args: dockerOpts) { - timeout(time: 5, unit: 'HOURS'){ + withDockerContainer(image: conf.image, args: dockerOpts) { + timeout(time: conf.get("timeoutHours", 2), unit: 'HOURS'){ try{ sh "rocminfo" sh "python3 --version" - sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py" - sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py" - sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py" - sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha_varlen.py" - sh "python3 /home/jenkins/workspace/aiter/op_tests/test_batch_prefill.py" - sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe.py" - sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_2stage.py" - sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_blockscale.py" - sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_ep.py" - sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_sorting.py" - sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_sorting_mxfp4.py" - sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_tkw1.py" + for (cmd in conf.execute_cmds) { + sh "${cmd}" + } } catch(e){ - echo "Throwing error exception while running AITER tests" + echo "Throwing error exception while running ${env.STAGE_NAME}" echo 'Exception occurred: ' + e.toString() throw e } finally{ - echo "Finished running AITER tests" + echo "Finished running ${env.STAGE_NAME}" } } } } - -def run_pytorch_tests(Map conf=[:]){ - show_node_info() - checkoutComposableKernel() - //use the latest pytorch-nightly image - def image = "${env.CK_DOCKERHUB}:ck_pytorch" - def dockerOpts=get_docker_options() + ' --group-add irc ' - - gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'rocm-libraries') { - try - { - echo "Pulling image: ${image}" - retimage = docker.image("${image}") - withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) { - retimage.pull() - } - } - catch(Exception ex) - { - error "Unable to locate image: ${image}" - } - } - - withDockerContainer(image: image, args: dockerOpts) { - timeout(time: 2, unit: 'HOURS'){ - try{ - sh "rocminfo" - sh "python3 --version" - sh "python3 /tmp/pytorch/tools/amd_build/build_amd.py" - sh "USE_ROCM_CK_SDPA=1 PYTORCH_ROCM_ARCH=gfx942 python /tmp/pytorch/setup.py develop" - } - catch(e){ - echo "Throwing error exception while building Pytorch" - echo 'Exception occurred: ' + e.toString() - throw e - } - finally{ - echo "Finished building Pytorch" - } - } - } 
+def getPytorchTestsCmds() { + return [ + "python3 /tmp/pytorch/tools/amd_build/build_amd.py", + "USE_ROCM_CK_SDPA=1 PYTORCH_ROCM_ARCH=gfx942 python /tmp/pytorch/setup.py develop" + ] +} +def getAiterTestsCmds() { + return [ + "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py", + "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py", + "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py", + "python3 /home/jenkins/workspace/aiter/op_tests/test_mha_varlen.py", + "python3 /home/jenkins/workspace/aiter/op_tests/test_batch_prefill.py", + "python3 /home/jenkins/workspace/aiter/op_tests/test_moe.py", + "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_2stage.py", + "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_blockscale.py", + "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_ep.py", + "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_sorting.py", + "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_sorting_mxfp4.py", + "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_tkw1.py" + ] +} +def getFaTestsCmds() { + return [ + "python3 -u -m pytest /home/jenkins/workspace/flash-attention/tests/test_flash_attn_ck.py" + ] } //launch develop branch daily jobs @@ -1248,15 +1192,20 @@ CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_ 0 22 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_TILE_ENGINE_BASIC_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true 0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX101=false;BUILD_GFX908=false;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true;BUILD_PACKAGES=true 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=develop;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true - 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true - 0 15 * * * % BUILD_INSTANCES_ONLY=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;FORCE_CI=true - 0 13 * * * % RUN_FULL_CONV_TILE_TESTS=true;RUN_AITER_TESTS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;FORCE_CI=true - 0 11 * * * % RUN_PYTORCH_TESTS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;BUILD_GFX101=false;BUILD_GFX103=false;BUILD_GFX11=false;BUILD_GFX12=false;BUILD_GFX90A=false;FORCE_CI=true''' : "" + 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=therock;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true + 0 15 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true + 0 13 * * * % BUILD_INSTANCES_ONLY=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;FORCE_CI=true + 0 11 * * * % RUN_FULL_CONV_TILE_TESTS=true;RUN_AITER_TESTS=true;RUN_FA_TESTS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;FORCE_CI=true + 0 9 * * * % RUN_PYTORCH_TESTS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;BUILD_GFX101=false;BUILD_GFX103=false;BUILD_GFX11=false;BUILD_GFX12=false;BUILD_GFX90A=false;FORCE_CI=true''' : "" +CURRENT_BRANCH_NAME = env.CHANGE_BRANCH ? env.CHANGE_BRANCH : env.BRANCH_NAME + +POLL_SPEC = BRANCH_NAME == "develop" ? 
'H H/6 * * *' : '' pipeline { agent none triggers { parameterizedCron(CRON_SETTINGS) + pollSCM(POLL_SPEC) } options { skipDefaultCheckout() @@ -1278,7 +1227,7 @@ pipeline { string( name: 'COMPILER_VERSION', defaultValue: '', - description: 'Specify which version of compiler to use: release, develop, amd-mainline, or leave blank (default).') + description: 'Specify which version of compiler to use: develop, amd-staging, therock, or leave blank (default).') string( name: 'COMPILER_COMMIT', defaultValue: '', @@ -1381,8 +1330,8 @@ pipeline { description: "Build CK and run tests on gfx12 (default: ON)") booleanParam( name: "NINJA_BUILD_TRACE", - defaultValue: false, - description: "Generate a ninja build trace (default: OFF)") + defaultValue: true, + description: "Generate a ninja build trace (default: ON)") booleanParam( name: "NINJA_FTIME_TRACE", defaultValue: false, @@ -1409,8 +1358,8 @@ pipeline { description: "Try building PYTORCH with latest CK develop branch (default: OFF)") string( name: 'ck_pytorch_branch', - defaultValue: 'develop', - description: 'Specify which branch of CK to test with Pytorch (default: develop)') + defaultValue: CURRENT_BRANCH_NAME, + description: 'Specify which branch of CK to test with Pytorch (default: current branch)') booleanParam( name: "RUN_AITER_TESTS", defaultValue: false, @@ -1425,8 +1374,24 @@ pipeline { description: 'Specify which branch of AITER to use (default: main)') string( name: 'ck_aiter_branch', - defaultValue: 'develop', - description: 'Specify which branch of CK to test with AITER (default: develop)') + defaultValue: CURRENT_BRANCH_NAME, + description: 'Specify which branch of CK to test with AITER (default: current branch)') + booleanParam( + name: "RUN_FA_TESTS", + defaultValue: false, + description: "Run Flash Attention tests with latest CK develop branch (default: OFF)") + string( + name: 'fa_base_docker', + defaultValue: 'rocm/pytorch:rocm7.1.1_ubuntu24.04_py3.12_pytorch_release_2.9.1', + description: 'Specify which base docker image to use for flash-attention tests') + string( + name: 'fa_branch', + defaultValue: 'ck_improve_main', + description: 'Specify which branch of flash-attention to use (default: ck_improve_main)') + string( + name: 'ck_fa_branch', + defaultValue: CURRENT_BRANCH_NAME, + description: 'Specify which branch of CK to test with flash-attention (default: current branch)') booleanParam( name: "FORCE_CI", defaultValue: false, @@ -1519,7 +1484,7 @@ pipeline { } } } - stage("Run Pytorch Tests") + stage("Run Downstream Tests") { when { beforeAgent true @@ -1535,20 +1500,10 @@ pipeline { } agent{ label rocmnode("gfx942")} steps{ - run_pytorch_tests() + run_downstream_tests(image: "${env.CK_PYTORCH_IMAGE}", timeoutHours: 2, execute_cmds: getPytorchTestsCmds()) cleanWs() } } - } - } - stage("Run AITER Tests") - { - when { - beforeAgent true - expression { env.SHOULD_RUN_CI.toBoolean() } - } - parallel - { stage("Run AITER Tests on gfx942") { when { @@ -1557,7 +1512,7 @@ pipeline { } agent{ label rocmnode("gfx942")} steps{ - run_aiter_tests() + run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: getAiterTestsCmds()) cleanWs() } } @@ -1569,7 +1524,31 @@ pipeline { } agent{ label rocmnode("gfx950")} steps{ - run_aiter_tests() + run_downstream_tests(image: "${env.CK_AITER_IMAGE}", timeoutHours: 5, execute_cmds: getAiterTestsCmds()) + cleanWs() + } + } + stage("Run FA Tests on gfx942") + { + when { + beforeAgent true + expression { params.RUN_FA_TESTS.toBoolean() } + } + agent{ label 
rocmnode("gfx942")} + steps{ + run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: getFaTestsCmds()) + cleanWs() + } + } + stage("Run FA Tests on gfx950") + { + when { + beforeAgent true + expression { params.RUN_FA_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx950")} + steps{ + run_downstream_tests(image: "${env.CK_FA_IMAGE}", timeoutHours: 5, execute_cmds: getFaTestsCmds()) cleanWs() } } @@ -2109,7 +2088,10 @@ pipeline { description: 'Some checks have failed' node(rocmnode("nogpu")) { script { - sendFailureNotifications() + checkoutComposableKernel() + } + withCredentials([string(credentialsId: 'ck_ci_errors_webhook_url', variable: 'WEBHOOK_URL')]) { + sh 'bash projects/composablekernel/script/infra_helper/send_failure_notifications.sh' } } } diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp index 863501cd0a..9895ed7e54 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp @@ -198,10 +198,6 @@ struct Epilogue input_left_pads, input_right_pads); - // auto res = rtc::from_gpu(out_dev); - // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); - // assert(pass); - // Simple check: this checks that the output from each instance matches the output from the // first instance CHECK(report(solution, check(rtc::from_gpu(out_dev)))); diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp index e748a29743..617c2318d5 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp @@ -198,10 +198,6 @@ struct Epilogue input_left_pads, input_right_pads); - // auto res = rtc::from_gpu(out_dev); - // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); - // assert(pass); - // Simple check: this checks that the output from each instance matches the output from the // first instance CHECK(report(solution, check(rtc::from_gpu(out_dev)))); diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp index a68fb53cba..84516b2577 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp @@ -198,10 +198,6 @@ struct Epilogue input_left_pads, input_right_pads); - // auto res = rtc::from_gpu(out_dev); - // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); - // assert(pass); - // Simple check: this checks that the output from each instance matches the output from the // first instance CHECK(report(solution, check(rtc::from_gpu(out_dev)))); diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp index 0262319c39..3490c38f6a 100644 --- a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp +++ b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp @@ -198,10 +198,6 @@ struct Epilogue input_left_pads, input_right_pads); - // auto res = rtc::from_gpu(out_dev); - // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); - // assert(pass); - // Simple check: this checks that the output from each instance matches the output from the // first instance CHECK(report(solution, check(rtc::from_gpu(out_dev)))); diff --git a/dispatcher/README.md b/dispatcher/README.md index d1ca299d78..dc864f7c62 100644 --- a/dispatcher/README.md +++ b/dispatcher/README.md @@ -1,6 +1,6 @@ # CK Tile 
Dispatcher
-A unified kernel dispatch system for AMD GPUs with C++ and Python frontends.
+A unified kernel dispatch system for AMD GPUs with C++ and Python frontends, supporting GEMM and Grouped Convolution operations.
**Validated Platform:** AMD Instinct MI300 series (gfx942)
@@ -154,6 +154,8 @@ rocminfo | grep -i "gfx"
### Install Python Dependencies
+#### Core Dependencies (Required)
+
NumPy is required for Python examples and kernel generation. We recommend using a virtual environment:
**Option 1: Using standard venv**
@@ -165,8 +167,8 @@ python3 -m venv .venv
source .venv/bin/activate # Linux/macOS
# .venv\Scripts\activate # Windows
-# Install NumPy
-pip install numpy
+# Install core dependencies
+pip install -r python/requirements.txt
```
**Option 2: Using uv (faster alternative)**
@@ -179,17 +181,38 @@ uv venv .venv
source .venv/bin/activate # Linux/macOS
# .venv\Scripts\activate # Windows
-# Install NumPy
-uv pip install numpy
+# Install core dependencies
+uv pip install -r python/requirements.txt
```
**Option 3: System-wide install (not recommended)**
```bash
-pip install numpy
+pip install -r python/requirements.txt
```
> **Note:** Always activate your virtual environment before running CMake or Python examples.
+#### ML Heuristics Dependencies (Optional)
+
+For ML-based kernel selection (examples 09-11), install additional dependencies:
+
+```bash
+# Activate your virtual environment first
+source .venv/bin/activate
+
+# Install ML dependencies (LightGBM, pandas, pyarrow, scikit-learn)
+pip install -r requirements-ml.txt
+```
+
+**Why separate?** ML dependencies are large (especially pyarrow) and not needed for basic dispatcher usage. Install only if you need:
+- ML-based kernel selection (`examples/gemm/python/09_ml_heuristic.py`)
+- Model training (`heuristics/train.py`)
+- Model evaluation (`heuristics/evaluate.py`)
+- Automated benchmark analysis
+
+**Core dependencies:** ~50 MB (NumPy only)
+**With ML dependencies:** ~500 MB (includes LightGBM, pandas, pyarrow, scikit-learn)
+
### Supported Data Types
CK Tile supports a wide range of data types for GEMM operations:
@@ -319,8 +342,8 @@ ls examples/libdispatcher_gemm_lib.so
| `CMAKE_PREFIX_PATH` | - | ROCm installation path |
| `CMAKE_CXX_COMPILER` | - | Path to hipcc compiler |
-⚠️ **Important:** Always use `-DCMAKE_BUILD_TYPE=Release` for benchmarking. Debug builds are slower.
-⚠️ **Important:** Note that the current system provides single GPU target support for architecture-based kernel filtering, please do not use multiple GPU targets at a time (if necessary, please compile into different build directories).
+WARNING: **Important:** Always use `-DCMAKE_BUILD_TYPE=Release` for benchmarking. Debug builds are slower.
+WARNING: **Important:** Note that the current system provides single GPU target support for architecture-based kernel filtering, please do not use multiple GPU targets at a time (if necessary, please compile into different build directories).
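The single-target warning above pairs with the `rocminfo | grep -i "gfx"` check earlier in this README. As a hedged illustration (not part of the patch), the sketch below shows one way a build script might pick exactly one gfx target before configuring CMake; the `GPU_TARGETS` option name and the reliance on `rocminfo` output are assumptions, so adapt them to whatever the build actually uses.

```python
# Illustrative sketch only: pick a single gfx target for the dispatcher build,
# since the README above says multi-target builds are not supported.
# Assumes `rocminfo` is on PATH and that the CMake option is named GPU_TARGETS.
import re
import subprocess

def detect_single_gfx_target() -> str:
    """Return one gfx target (e.g. 'gfx942') reported by rocminfo."""
    out = subprocess.run(["rocminfo"], capture_output=True, text=True, check=True).stdout
    targets = sorted(set(re.findall(r"gfx[0-9a-f]+", out)))
    if not targets:
        raise RuntimeError("No gfx target found; is a ROCm-capable GPU visible?")
    if len(targets) > 1:
        print(f"Multiple targets found {targets}; using {targets[0]} "
              "(build the others in separate build directories).")
    return targets[0]

if __name__ == "__main__":
    print(f"-DGPU_TARGETS={detect_single_gfx_target()}")
```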
---
@@ -340,6 +363,15 @@ cd build/examples
./gemm_04_heuristics # Heuristic kernel selection
./gemm_05_json_export # Registry JSON export
./gemm_06_multi_registry # Multiple registries
+
+# Grouped Convolution Examples
+./grouped_conv_01_basic # Declaration patterns + GPU execution
+./grouped_conv_02_all_dirs # Forward/BwdData/BwdWeight with GPU
+./grouped_conv_03_bench_val # Benchmark + CPU reference validation
+./grouped_conv_04_registry_json # Heuristic selection + JSON export
+./grouped_conv_05_bwd_data # Backward data + CPU validation
+./grouped_conv_06_bwd_weight # Backward weight + CPU validation
+./grouped_conv_07_benchmark # Multi-tile ResNet benchmark
```
### Python Examples
@@ -352,8 +384,16 @@ cd /path/to/composable_kernel/dispatcher
# GEMM Examples
python3 examples/gemm/python/01_basic_gemm.py # Basic multi-kernel GEMM
python3 examples/gemm/python/04_validation.py # CPU reference validation
-python3 examples/gemm/python/07_stress_test.py # Stress test (48 kernels)
+python3 examples/gemm/python/07_stress_test.py # Stress test
python3 examples/gemm/python/08_heuristics.py # Heuristic selection
+
+# Grouped Convolution Examples
+python3 examples/grouped_conv/python/01_basic_grouped_conv.py # Config patterns + registry + GPU
+python3 examples/grouped_conv/python/02_forward.py # Forward 2D/3D + CPU ref
+python3 examples/grouped_conv/python/03_bwd_data.py # Backward data + CPU ref
+python3 examples/grouped_conv/python/04_bwd_weight.py # Backward weight + CPU ref
+python3 examples/grouped_conv/python/05_benchmark.py # Multi-problem benchmark
+python3 examples/grouped_conv/python/06_registry_json.py # Heuristic selection + JSON
```
### Example Output
@@ -470,6 +510,42 @@ python3 examples/gemm/python/10_advanced_benchmark.py \
---
+## ML-Based Kernel Selection (Optional)
+
+The dispatcher includes ML heuristics for automated kernel selection using trained LightGBM models.
+
+**Prerequisites:** Install ML dependencies first:
+
+```bash
+pip install -r requirements-ml.txt # ~500 MB (LightGBM, pandas, pyarrow, scikit-learn)
+```
+
+**Documentation:** See [heuristics/README.md](heuristics/README.md) for:
+- Training and evaluating models
+- Feature engineering (72 features)
+- Using pre-trained models
+- Python API reference
+
+**Examples:**
+```bash
+python3 examples/gemm/python/09_ml_heuristic.py # ML-based kernel selection
+python3 examples/gemm/python/10_rank_kernels.py # Kernel ranking
+```
+
+**Model Compression:** Trained models are stored in compressed `.lgbm.gz` format to save space (~67% size reduction). Python tools automatically decompress models on first use. For C++ examples, decompress manually:
+
+```bash
+# If you have compressed models
+cd heuristics/models/gemm_universal_fp16_gfx950
+gunzip model_tflops.lgbm.gz
+
+# Then use in C++ example
+cd ../../../build
+./gemm_09_ml_heuristic --model ../heuristics/models/gemm_universal_fp16_gfx950/model_tflops.lgbm
+```
+
+---
+
## External Integration
### Using Dispatcher in Your Own Project
@@ -588,7 +664,7 @@ lib = DispatcherLib.load("/absolute/path/to/libdispatcher_gemm_lib.so")
### Data Flow
```
-KernelConfig → Registry → Dispatcher → GPU Execution
+KernelConfig -> Registry -> Dispatcher -> GPU Execution
```
1.
**KernelConfig**: Defines kernel parameters (tile sizes, data types, layouts) @@ -784,31 +860,49 @@ make -j$(nproc) ``` dispatcher/ -├── README.md # This file -├── CMakeLists.txt # Build configuration -│ -├── include/ck_tile/dispatcher/ # C++ headers -│ ├── dispatcher.hpp # GEMM dispatcher -│ ├── registry.hpp # Kernel registry -│ └── kernel_key.hpp # Kernel configuration -│ -├── src/ # C++ implementation -│ -├── codegen/ # Kernel generation -│ ├── unified_gemm_codegen.py # GEMM kernel generator -│ └── arch_specs.json # GPU specifications -│ -├── bindings/ctypes/ # Python ctypes interface -│ └── gemm_ctypes_lib.cpp # GEMM Python library -│ -├── examples/ # Examples -│ └── gemm/ -│ ├── cpp/ # C++ GEMM examples (01-06) -│ └── python/ # Python GEMM examples (01-11) -│ -├── scripts/ # Build scripts -│ -└── tests/ # Unit tests +|---- README.md # This file +|---- CMakeLists.txt # Build configuration +| +|---- include/ck_tile/dispatcher/ # C++ headers +| |---- dispatcher.hpp # Main dispatcher include +| |---- registry.hpp # GEMM kernel registry +| |---- kernel_key.hpp # Kernel configuration +| |---- grouped_conv_config.hpp # Grouped conv configuration +| |---- grouped_conv_problem.hpp # Grouped conv problem (with builder) +| |---- grouped_conv_kernel_decl.hpp # Grouped conv kernel declarations +| |---- grouped_conv_registry.hpp # Grouped conv registry (thread-safe) +| +---- grouped_conv_utils.hpp # Grouped conv utilities +| +|---- src/ # C++ implementation +| +|---- codegen/ # Kernel generation +| |---- codegen_common.py # Shared: TileConfig, TraitConfigBase, type mappings +| |---- unified_gemm_codegen.py # GEMM kernel generator +| |---- unified_grouped_conv_codegen.py # Grouped conv kernel generator +| +---- arch_specs.json # GPU specifications +| +|---- python/ # Python utilities +| |---- dispatcher_common.py # Shared: paths, validation, Colors, phased output +| |---- ctypes_utils.py # GEMM ctypes utilities +| +---- grouped_conv_utils.py # Grouped conv utilities +| +|---- scripts/ # Build scripts +| |---- compile_gemm_examples.py # GEMM build script +| +---- compile_grouped_conv_examples.py # Grouped conv build script +| +|---- bindings/ctypes/ # Python ctypes interface +| |---- gemm_ctypes_lib.cpp # GEMM Python library +| +---- conv_ctypes_lib.cpp # Grouped conv Python library +| +|---- examples/ # Examples +| |---- gemm/ +| | |---- cpp/ # C++ GEMM examples (01-07) +| | +---- python/ # Python GEMM examples (01-11) +| +---- grouped_conv/ +| |---- cpp/ # C++ Grouped Conv examples (01-07) +| +---- python/ # Python Grouped Conv examples (01-06) +| ++---- tests/ # Unit tests (C++ and Python) ``` --- @@ -820,17 +914,49 @@ dispatcher/ | GEMM C++ | [examples/gemm/cpp/README.md](examples/gemm/cpp/README.md) | | GEMM Python | [examples/gemm/python/README.md](examples/gemm/python/README.md) | | Codegen | [codegen/README.md](codegen/README.md) | +| Python Utils | [python/README.md](python/README.md) | +| C++ Headers | [include/ck_tile/dispatcher/README.md](include/ck_tile/dispatcher/README.md) | --- -## Archived Content +## Grouped Convolution Support -Convolution examples and utilities have been archived to `ck-2/conv_archive/dispatcher/`: -- `examples/conv/cpp/` - 11 C++ convolution examples -- `examples/conv/python/` - 14 Python convolution examples -- `codegen/unified_conv_codegen.py` - Conv kernel generator -- `include/ck_tile/dispatcher/conv_*.hpp` - Conv headers -- `python/conv_utils.py` - Conv Python utilities +Grouped convolution is fully supported alongside GEMM, with shared infrastructure to 
eliminate duplication. + +### Python + +```bash +# Generate grouped conv kernels +python3 codegen/unified_grouped_conv_codegen.py \ + --output-dir build/generated_kernels \ + --datatype fp16 --variant forward --ndim-spatial 2 + +# Build grouped conv examples +python3 scripts/compile_grouped_conv_examples.py examples/grouped_conv/cpp/01_basic_grouped_conv.cpp +``` + +### Key Files + +| Component | File | +|-----------|------| +| C++ Headers | `include/ck_tile/dispatcher/grouped_conv_*.hpp` | +| Python Codegen | `codegen/unified_grouped_conv_codegen.py` | +| Python Utils | `python/grouped_conv_utils.py` | +| Build Script | `scripts/compile_grouped_conv_examples.py` | +| Shared Codegen | `codegen/codegen_common.py` | +| Shared Utils | `python/dispatcher_common.py` | + +### Variants + +- **Forward** (`grouped_conv_fwd`) - Standard grouped convolution +- **Backward Data** (`grouped_conv_bwd_data`) - Gradient w.r.t. input +- **Backward Weight** (`grouped_conv_bwd_weight`) - Gradient w.r.t. weights + +### Shared Infrastructure + +GEMM and grouped convolution share common code to avoid duplication: +- `codegen/codegen_common.py` - TileConfig, TraitConfigBase, type mappings, parallel generation, arch-aware expansion +- `python/dispatcher_common.py` - Path helpers, validation, auto-correction, Colors, phased output --- diff --git a/dispatcher/bindings/README.md b/dispatcher/bindings/README.md index 7cda21f6ec..04029d32a9 100644 --- a/dispatcher/bindings/README.md +++ b/dispatcher/bindings/README.md @@ -6,13 +6,13 @@ This directory contains language bindings for the CK Tile Dispatcher. ``` bindings/ -├── ctypes/ # Python ctypes bindings (C API) -│ ├── gemm_ctypes_lib.cpp # GEMM dispatcher C API -│ ├── conv_ctypes_lib.cpp # Convolution dispatcher C API (fwd + bwd_data) -│ ├── conv_bwdw_ctypes_lib.cpp # Convolution backward weight C API -│ ├── gpu_helper.cpp # CLI helper for Python -│ └── CMakeLists.txt -└── README.md +|---- ctypes/ # Python ctypes bindings (C API) +| |---- gemm_ctypes_lib.cpp # GEMM dispatcher C API +| |---- conv_ctypes_lib.cpp # Grouped conv dispatcher C API (fwd + bwd_data) +| |---- conv_bwdw_ctypes_lib.cpp # Grouped conv backward weight C API (separate library) +| |---- gpu_helper.cpp # CLI helper for Python +| +---- CMakeLists.txt ++---- README.md ``` ## ctypes Bindings @@ -65,7 +65,7 @@ lib.dispatcher_cleanup() | `dispatcher_export_registry_json()` | Export registry as JSON | | `dispatcher_cleanup()` | Release resources | -### Convolution API +### Grouped Convolution API | Function | Description | |----------|-------------| @@ -105,5 +105,11 @@ Output is JSON for easy parsing: See the examples that use these bindings: - **GEMM**: `dispatcher/examples/gemm/python/` -- **Conv**: `dispatcher/examples/conv/python/` + +### Grouped Convolution + +Grouped convolution C++ headers and Python utilities are in: +- **C++ Headers**: `dispatcher/include/ck_tile/dispatcher/grouped_conv_*.hpp` +- **Python Utils**: `dispatcher/python/grouped_conv_utils.py` +- **Build Script**: `dispatcher/scripts/compile_grouped_conv_examples.py` diff --git a/dispatcher/bindings/ctypes/CMakeLists.txt b/dispatcher/bindings/ctypes/CMakeLists.txt index 804e5e9bd7..18314017f2 100644 --- a/dispatcher/bindings/ctypes/CMakeLists.txt +++ b/dispatcher/bindings/ctypes/CMakeLists.txt @@ -78,7 +78,7 @@ endif() # Look for forward kernels file(GLOB CONV_FWD_KERNEL_HEADERS "${CMAKE_BINARY_DIR}/generated_kernels/conv_fwd_*.hpp") # Look for backward data kernels -file(GLOB CONV_BWDD_KERNEL_HEADERS 
"${CMAKE_BINARY_DIR}/generated_kernels/conv_bwdd_*.hpp") +file(GLOB CONV_BWDD_KERNEL_HEADERS "${CMAKE_BINARY_DIR}/generated_kernels/conv_bwd_data_*.hpp") # Fallback: any conv kernel (for backwards compatibility) file(GLOB CONV_KERNEL_HEADERS "${CMAKE_BINARY_DIR}/generated_kernels/conv_*.hpp") @@ -112,7 +112,7 @@ endif() # Add backward data kernel if available if(CONV_BWDD_KERNEL_HEADERS) list(GET CONV_BWDD_KERNEL_HEADERS 0 CONV_BWDD_KERNEL_HEADER) - message(STATUS "Found Conv BWD_DATA kernel for ctypes lib: ${CONV_BWDD_KERNEL_HEADER}") + message(STATUS "Found Conv BWD_DATA kernel for ctypes lib: ${CONV_BWD_DATA_KERNEL_HEADER}") target_compile_options(dispatcher_conv_lib PRIVATE -include ${CONV_BWDD_KERNEL_HEADER}) target_compile_definitions(dispatcher_conv_lib PRIVATE CONV_BWD_DATA_AVAILABLE) endif() diff --git a/dispatcher/bindings/ctypes/conv_bwdw_ctypes_lib.cpp b/dispatcher/bindings/ctypes/conv_bwdw_ctypes_lib.cpp index 09e058f80f..96b4aa3462 100644 --- a/dispatcher/bindings/ctypes/conv_bwdw_ctypes_lib.cpp +++ b/dispatcher/bindings/ctypes/conv_bwdw_ctypes_lib.cpp @@ -53,6 +53,7 @@ struct ConvBwdwProblemC int stride_d, stride_h, stride_w; int pad_d, pad_h, pad_w; int dilation_d, dilation_h, dilation_w; + int split_k; }; // ============================================================================= @@ -108,8 +109,7 @@ static float run_bwd_weight_impl(const void* input_ptr, grad_weight_ptr, // wei_ptr = grad_weight (output) {}, // ds_ptr grad_output_ptr, // out_ptr = grad_output - 1 // k_batch - ); + (prob->split_k > 1) ? prob->split_k : 1); ck_tile::stream_config stream_cfg{static_cast(stream), true, 1, 3, 10}; diff --git a/dispatcher/bindings/ctypes/conv_ctypes_lib.cpp b/dispatcher/bindings/ctypes/conv_ctypes_lib.cpp index d3c64621a7..002219c82e 100644 --- a/dispatcher/bindings/ctypes/conv_ctypes_lib.cpp +++ b/dispatcher/bindings/ctypes/conv_ctypes_lib.cpp @@ -1,128 +1,46 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT - -/** - * Convolution Dispatcher ctypes Library - * - * Provides C API for Python ctypes integration. - * Supports forward convolution. Backward operations require additional headers. - * - * REQUIRED: Forward kernel header must be force-included via -include flag. - * OPTIONAL: Backward kernels can be added with CONV_BWD_DATA_AVAILABLE/CONV_BWD_WEIGHT_AVAILABLE - * - * Usage from Python: - * lib = ctypes.CDLL("libdispatcher_conv.so") - * lib.conv_dispatcher_init() - * lib.conv_dispatcher_run(...) - */ +// +// Multi-kernel grouped convolution dispatcher for Python ctypes. 
+// +// Supports: forward / backward-data / backward-weight x 2D / 3D +// +// The dispatch header (conv_python_dispatch.hpp) is force-included via +// -include and brings in ALL compiled kernels with these aliases: +// +// 2D launchers (from include_all headers): +// SelectedConvKernelLauncher (forward 2D) +// SelectedConvBwdDataLauncher (backward-data 2D) +// SelectedConvBwdWeightLauncher (backward-weight 2D) +// +// 3D launchers (from dispatch header): +// ConvFwd3dLauncher (forward 3D) +// ConvBwdData3dLauncher (backward-data 3D) +// ConvBwdWeight3dLauncher (backward-weight 3D) +// +// Usage from Python: +// lib = ctypes.CDLL("libdispatcher_conv_lib.so") +// lib.conv_dispatcher_init() +// lib.conv_dispatcher_run(input, weight, output, &problem, stream) #include -#include -#include +#include #include -#include "ck_tile/dispatcher/conv_utils.hpp" #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" -using namespace ck_tile::dispatcher; - -// Global state (using shared_ptr for safe memory management) -static std::shared_ptr g_registry = nullptr; -static std::shared_ptr g_dispatcher = nullptr; -static std::vector g_kernels; - extern "C" { -// ============================================================================= -// Initialization -// ============================================================================= - -int conv_dispatcher_init() +// ========================================================================= +// Problem definition (matches Python ctypes struct exactly) +// ========================================================================= +enum ConvDirection { - if(g_registry) - return 0; // Already initialized - - g_registry = std::make_shared(); - g_dispatcher = std::make_shared(g_registry.get()); - - // Register kernel configurations using simple ConvKernelSet - // (actual kernel launch uses the force-included SelectedConvKernelLauncher) - using namespace ck_tile::dispatcher::conv_decl; - - // Forward kernels (required - must be force-included) - // Must match: conv_fwd_fp16_nhwgc_2d_compv4_cshuffle_intrawave_128x128x64_2x2x1_32x32x16_dsb - ConvKernelSet fwd_set; - fwd_set.add(ConvSignature().dtype("fp16").layout("nhwgc").conv_type("forward").dims(2), - ConvAlgorithm() - .tile(128, 128, 64) // tile_m x tile_n x tile_k - .wave(2, 2, 1) - .warp(32, 32, 16) - .pipeline("compv4") - .scheduler("intrawave"), - "gfx942"); - g_registry->register_set(fwd_set, ConvRegistry::Priority::High); - -#ifdef CONV_BWD_DATA_AVAILABLE - // Backward data kernels - // Must match: conv_bwdd_fp16_nhwgc_2d_compv3_cshuffle_intrawave_128x128x64_2x2x1_32x32x16 - ConvKernelSet bwd_data_set; - bwd_data_set.add(ConvSignature().dtype("fp16").layout("nhwgc").conv_type("bwd_data").dims(2), - ConvAlgorithm() - .tile(128, 128, 64) // tile_m x tile_n x tile_k - .wave(2, 2, 1) - .warp(32, 32, 16) - .pipeline("compv3") - .scheduler("intrawave"), - "gfx942"); - g_registry->register_set(bwd_data_set, ConvRegistry::Priority::High); -#endif - - return 0; -} - -int conv_dispatcher_cleanup() -{ - // shared_ptr automatically handles cleanup when reset - g_dispatcher.reset(); - g_registry.reset(); - g_kernels.clear(); - return 0; -} - -// ============================================================================= -// Registry Management -// ============================================================================= - -int conv_dispatcher_get_kernel_count() -{ - if(!g_registry) - return 0; - return static_cast(g_registry->size()); -} - -int conv_dispatcher_get_kernel_name(int index, char* buffer, int 
buffer_size) -{ - if(index < 0 || !buffer || buffer_size <= 0) - return -1; - - if(!g_registry) - return -1; - - // Use registry to get kernel names (they are registered with full names) - const auto& kernels = g_registry->all_kernels(); - if(static_cast(index) >= kernels.size()) - return -1; - - const auto* kernel = kernels[index]; - std::strncpy(buffer, kernel->name().c_str(), buffer_size - 1); - buffer[buffer_size - 1] = '\0'; - return 0; -} - -// ============================================================================= -// Problem Definition -// ============================================================================= + CONV_FORWARD = 0, + CONV_BWD_DATA = 1, + CONV_BWD_WEIGHT = 2 +}; struct ConvProblemC { @@ -132,267 +50,33 @@ struct ConvProblemC int stride_d, stride_h, stride_w; int pad_d, pad_h, pad_w; int dilation_d, dilation_h, dilation_w; - int direction; // 0=forward, 1=bwd_data, 2=bwd_weight + int direction; + int split_k; }; -// ============================================================================= -// Kernel Selection -// ============================================================================= +// ========================================================================= +// Initialization / lifecycle +// ========================================================================= +int conv_dispatcher_init() { return 0; } +int conv_dispatcher_cleanup() { return 0; } -int conv_dispatcher_is_supported(const ConvProblemC* prob) -{ - if(!g_registry || !prob) - return 0; - - ConvProblem problem; - problem.N = prob->N; - problem.G = prob->G; - problem.C = prob->C; - problem.K = prob->K; - problem.input_spatial = {prob->input_d, prob->input_h, prob->input_w}; - problem.filter_spatial = {prob->filter_z, prob->filter_y, prob->filter_x}; - problem.stride = {prob->stride_d, prob->stride_h, prob->stride_w}; - problem.padding = {prob->pad_d, prob->pad_h, prob->pad_w}; - problem.dilation = {prob->dilation_d, prob->dilation_h, prob->dilation_w}; - problem.op = static_cast(prob->direction); - problem.compute_output_size(); - - const auto* kernel = g_dispatcher->select(problem); - return kernel ? 
1 : 0; -} - -int conv_dispatcher_select_kernel(const ConvProblemC* prob, char* kernel_name, int buffer_size) -{ - if(!g_registry || !prob || !kernel_name || buffer_size <= 0) - return -1; - - ConvProblem problem; - problem.N = prob->N; - problem.G = prob->G; - problem.C = prob->C; - problem.K = prob->K; - problem.input_spatial = {prob->input_d, prob->input_h, prob->input_w}; - problem.filter_spatial = {prob->filter_z, prob->filter_y, prob->filter_x}; - problem.stride = {prob->stride_d, prob->stride_h, prob->stride_w}; - problem.padding = {prob->pad_d, prob->pad_h, prob->pad_w}; - problem.dilation = {prob->dilation_d, prob->dilation_h, prob->dilation_w}; - problem.op = static_cast(prob->direction); - problem.compute_output_size(); - - const auto* kernel = g_dispatcher->select(problem); - if(!kernel) - return -1; - - std::strncpy(kernel_name, kernel->name().c_str(), buffer_size - 1); - kernel_name[buffer_size - 1] = '\0'; - - return 0; -} - -// ============================================================================= -// Convolution Execution -// ============================================================================= - -// Helper to build ConvParam -static ck_tile::conv::ConvParam build_conv_param(const ConvProblemC* prob) -{ - // Determine if this is 2D or 3D convolution - const bool is_3d = (prob->input_d > 1 || prob->filter_z > 1); - - if(is_3d) - { - // 3D convolution: use all spatial dimensions - return ck_tile::conv::ConvParam{3, - prob->G, - prob->N, - prob->K, - prob->C, - {prob->filter_z, prob->filter_y, prob->filter_x}, - {prob->input_d, prob->input_h, prob->input_w}, - {prob->stride_d, prob->stride_h, prob->stride_w}, - {prob->dilation_d, prob->dilation_h, prob->dilation_w}, - {prob->pad_d, prob->pad_h, prob->pad_w}, - {prob->pad_d, prob->pad_h, prob->pad_w}}; - } - else - { - // 2D convolution: only use H, W dimensions - return ck_tile::conv::ConvParam{2, - prob->G, - prob->N, - prob->K, - prob->C, - {prob->filter_y, prob->filter_x}, - {prob->input_h, prob->input_w}, - {prob->stride_h, prob->stride_w}, - {prob->dilation_h, prob->dilation_w}, - {prob->pad_h, prob->pad_w}, - {prob->pad_h, prob->pad_w}}; - } -} - -// Forward convolution (required - kernel header must be force-included) -static float run_forward(const void* input_ptr, - const void* weight_ptr, - void* output_ptr, - const ConvProblemC* prob, - void* stream) -{ - auto conv_param = build_conv_param(prob); - - ck_tile::GroupedConvFwdHostArgs<> args(conv_param, input_ptr, weight_ptr, {}, output_ptr, 1); - - ck_tile::stream_config stream_cfg{static_cast(stream), true, 1, 3, 10}; - - // SelectedConvKernelLauncher is defined in the force-included forward kernel header - return SelectedConvKernelLauncher::launch(args, stream_cfg); -} - -#ifdef CONV_BWD_DATA_AVAILABLE -// Backward data convolution (optional) -// Computes: grad_input = conv_bwd_data(weight, grad_output) -// -// Parameters: -// grad_output_ptr: dY - gradient from next layer (const, read-only INPUT) -// weight_ptr: W - frozen weights (const, read-only INPUT) -// grad_input_ptr: dX - gradient for input (writable, OUTPUT) -static float run_bwd_data(const void* grad_output_ptr, - const void* weight_ptr, - void* grad_input_ptr, - const ConvProblemC* prob, - void* stream) -{ - auto conv_param = build_conv_param(prob); - - // CK Tile API uses tensor POSITION names (from forward pass), not data flow: - // in_ptr = input tensor position = grad_input_ptr (dX, OUTPUT of bwd_data) - // wei_ptr = weight tensor = weight_ptr (W, const) - // out_ptr = output tensor 
position = grad_output_ptr (dY, INPUT to bwd_data) - ck_tile::GroupedConvBwdDataHostArgs args( - conv_param, grad_input_ptr, weight_ptr, {}, grad_output_ptr, 1); - - ck_tile::stream_config stream_cfg{static_cast(stream), true, 1, 3, 10}; - - return SelectedConvBwdDataLauncher::launch(args, stream_cfg); -} -#endif - -#ifdef CONV_BWD_WEIGHT_AVAILABLE -// Backward weight convolution (optional) -// Parameters: -// input_ptr: original forward input X (const, read-only) -// grad_output_ptr: gradient from next layer dY (const, read-only) -// grad_weight_ptr: gradient of weights dW (writable, OUTPUT) -static float run_bwd_weight(const void* input_ptr, - const void* grad_output_ptr, - void* grad_weight_ptr, - const ConvProblemC* prob, - void* stream) -{ - auto conv_param = build_conv_param(prob); - - // GroupedConvBwdWeightHostArgs constructor order: - // (param, in=X, wei=dW (output), ds, out=dY (input), k_batch) - // Note: wei_ptr is the OUTPUT (grad_weight), out_ptr is the INPUT (grad_output) - ck_tile::GroupedConvBwdWeightHostArgs args( - conv_param, input_ptr, grad_weight_ptr, {}, grad_output_ptr, 1); - - ck_tile::stream_config stream_cfg{static_cast(stream), true, 1, 3, 10}; - - return SelectedConvBwdWeightLauncher::launch(args, stream_cfg); -} -#endif - -/** - * @brief Execute convolution based on direction specified in prob - * - * Parameter mapping varies by direction: - * Forward (direction=0): - * input_ptr = X (input tensor) - * weight_ptr = W (weight tensor) - * output_ptr = Y (output buffer) - * - * Backward Data (direction=1): - * input_ptr = dY (grad_output - gradient from next layer) - * weight_ptr = W (weight tensor, frozen) - * output_ptr = dX (grad_input buffer) - * - * Backward Weight (direction=2): - * input_ptr = X (forward input tensor) - * weight_ptr = dY (grad_output - gradient from next layer) - * output_ptr = dW (grad_weight buffer) - */ -float conv_dispatcher_run(const void* input_ptr, - const void* weight_ptr, - void* output_ptr, - const ConvProblemC* prob, - void* stream) -{ - // Validate all required pointers before kernel launch - if(!g_dispatcher || !prob) - return -1.0f; - if(!input_ptr || !weight_ptr || !output_ptr) - return -1.0f; // Null data pointer would cause kernel crash - - // Build problem for kernel selection - ConvProblem problem; - problem.N = prob->N; - problem.G = prob->G; - problem.C = prob->C; - problem.K = prob->K; - problem.input_spatial = {prob->input_d, prob->input_h, prob->input_w}; - problem.filter_spatial = {prob->filter_z, prob->filter_y, prob->filter_x}; - problem.stride = {prob->stride_d, prob->stride_h, prob->stride_w}; - problem.padding = {prob->pad_d, prob->pad_h, prob->pad_w}; - problem.dilation = {prob->dilation_d, prob->dilation_h, prob->dilation_w}; - problem.op = static_cast(prob->direction); - problem.compute_output_size(); - - // Select kernel - const auto* kernel = g_dispatcher->select(problem); - if(!kernel) - return -1.0f; - - // Dispatch based on direction - switch(prob->direction) - { - case 0: // Forward (always available) - return run_forward(input_ptr, weight_ptr, output_ptr, prob, stream); - -#ifdef CONV_BWD_DATA_AVAILABLE - case 1: // Backward data - // Convention: caller passes (grad_output, weight, grad_input_buffer) - // in the (input_ptr, weight_ptr, output_ptr) slots respectively. 
- // run_bwd_data expects: (grad_output, weight, grad_input) - return run_bwd_data(input_ptr, weight_ptr, output_ptr, prob, stream); -#endif - -#ifdef CONV_BWD_WEIGHT_AVAILABLE - case 2: // Backward weight - // Convention: caller passes (input, grad_output, grad_weight_buffer) - // in the (input_ptr, weight_ptr, output_ptr) slots respectively. - // run_bwd_weight expects: (input, grad_output, grad_weight) - return run_bwd_weight(input_ptr, weight_ptr, output_ptr, prob, stream); -#endif - - default: return -1.0f; - } -} - -// ============================================================================= -// Info -// ============================================================================= - -const char* conv_dispatcher_version() { return "1.0.0"; } +// ========================================================================= +// Library info +// ========================================================================= +const char* conv_dispatcher_version() { return "2.0.0"; } int conv_dispatcher_has_kernels() { - return 1; // Forward kernel is required +#if defined(CONV_FWD_2D_AVAILABLE) || defined(CONV_FWD_3D_AVAILABLE) + return 1; +#else + return 0; +#endif } int conv_dispatcher_has_bwd_data() { -#ifdef CONV_BWD_DATA_AVAILABLE +#if defined(CONV_BWD_DATA_2D_AVAILABLE) || defined(CONV_BWD_DATA_3D_AVAILABLE) return 1; #else return 0; @@ -401,11 +85,240 @@ int conv_dispatcher_has_bwd_data() int conv_dispatcher_has_bwd_weight() { -#ifdef CONV_BWD_WEIGHT_AVAILABLE +#if defined(CONV_BWD_WEIGHT_2D_AVAILABLE) || defined(CONV_BWD_WEIGHT_3D_AVAILABLE) return 1; #else return 0; #endif } +int conv_dispatcher_get_kernel_count() +{ + return CONV_KERNEL_COUNT; // defined in conv_python_dispatch.hpp +} + +int conv_dispatcher_get_kernel_name(int index, char* buffer, int buffer_size) +{ + if(!buffer || buffer_size <= 0 || index < 0 || index >= CONV_KERNEL_COUNT) + return -1; + std::strncpy(buffer, CONV_KERNEL_NAMES[index], buffer_size - 1); + buffer[buffer_size - 1] = '\0'; + return 0; +} + +// ========================================================================= +// Support query +// ========================================================================= +bool conv_dispatcher_is_supported(const ConvProblemC* prob) +{ + if(!prob) + return false; + const bool is_3d = (prob->input_d > 1 || prob->filter_z > 1); + switch(prob->direction) + { + case CONV_FORWARD: +#if defined(CONV_FWD_3D_AVAILABLE) + if(is_3d) + return true; +#endif +#if defined(CONV_FWD_2D_AVAILABLE) + if(!is_3d) + return true; +#endif + return false; + case CONV_BWD_DATA: +#if defined(CONV_BWD_DATA_3D_AVAILABLE) + if(is_3d) + return true; +#endif +#if defined(CONV_BWD_DATA_2D_AVAILABLE) + if(!is_3d) + return true; +#endif + return false; + case CONV_BWD_WEIGHT: +#if defined(CONV_BWD_WEIGHT_3D_AVAILABLE) + if(is_3d) + return true; +#endif +#if defined(CONV_BWD_WEIGHT_2D_AVAILABLE) + if(!is_3d) + return true; +#endif + return false; + default: return false; + } +} + +// ========================================================================= +// ConvParam builders +// ========================================================================= +static ck_tile::conv::ConvParam make_param_2d(const ConvProblemC* p) +{ + return ck_tile::conv::ConvParam{2, + p->G, + p->N, + p->K, + p->C, + {p->filter_y, p->filter_x}, + {p->input_h, p->input_w}, + {p->stride_h, p->stride_w}, + {p->dilation_h, p->dilation_w}, + {p->pad_h, p->pad_w}, + {p->pad_h, p->pad_w}}; +} + +static ck_tile::conv::ConvParam make_param_3d(const ConvProblemC* p) +{ + return 
ck_tile::conv::ConvParam{3, + p->G, + p->N, + p->K, + p->C, + {p->filter_z, p->filter_y, p->filter_x}, + {p->input_d, p->input_h, p->input_w}, + {p->stride_d, p->stride_h, p->stride_w}, + {p->dilation_d, p->dilation_h, p->dilation_w}, + {p->pad_d, p->pad_h, p->pad_w}, + {p->pad_d, p->pad_h, p->pad_w}}; +} + +// ========================================================================= +// Kernel launch helpers +// ========================================================================= + +#ifdef CONV_FWD_2D_AVAILABLE +static float +launch_fwd_2d(const void* in, const void* wei, void* out, const ConvProblemC* p, hipStream_t stream) +{ + auto param = make_param_2d(p); + ck_tile::GroupedConvFwdHostArgs<> args(param, in, wei, {}, out, 1); + ck_tile::stream_config sc{stream, true, 1, 3, 10}; + return SelectedConvKernelLauncher::launch(args, sc); +} +#endif + +#ifdef CONV_FWD_3D_AVAILABLE +static float +launch_fwd_3d(const void* in, const void* wei, void* out, const ConvProblemC* p, hipStream_t stream) +{ + auto param = make_param_3d(p); + ck_tile::GroupedConvFwdHostArgs<> args(param, in, wei, {}, out, 1); + ck_tile::stream_config sc{stream, true, 1, 3, 10}; + return ConvFwd3dLauncher::launch(args, sc); +} +#endif + +#ifdef CONV_BWD_DATA_2D_AVAILABLE +static float launch_bwd_data_2d( + const void* dy, const void* wei, void* dx, const ConvProblemC* p, hipStream_t stream) +{ + auto param = make_param_2d(p); + ck_tile::GroupedConvBwdDataHostArgs args(param, dx, wei, {}, dy, 1); + ck_tile::stream_config sc{stream, true, 1, 3, 10}; + return SelectedConvBwdDataLauncher::launch(args, sc); +} +#endif + +#ifdef CONV_BWD_DATA_3D_AVAILABLE +static float launch_bwd_data_3d( + const void* dy, const void* wei, void* dx, const ConvProblemC* p, hipStream_t stream) +{ + auto param = make_param_3d(p); + ck_tile::GroupedConvBwdDataHostArgs args(param, dx, wei, {}, dy, 1); + ck_tile::stream_config sc{stream, true, 1, 3, 10}; + return ConvBwdData3dLauncher::launch(args, sc); +} +#endif + +#ifdef CONV_BWD_WEIGHT_2D_AVAILABLE +static float launch_bwd_weight_2d( + const void* x, const void* dy, void* dw, const ConvProblemC* p, hipStream_t stream) +{ + auto param = make_param_2d(p); + const int k_batch = (p->split_k > 1) ? p->split_k : 1; + ck_tile::GroupedConvBwdWeightHostArgs args(param, x, dw, {}, dy, k_batch); + ck_tile::stream_config sc{stream, true, 1, 3, 10}; + return SelectedConvBwdWeightLauncher::launch(args, sc); +} +#endif + +#ifdef CONV_BWD_WEIGHT_3D_AVAILABLE +static float launch_bwd_weight_3d( + const void* x, const void* dy, void* dw, const ConvProblemC* p, hipStream_t stream) +{ + auto param = make_param_3d(p); + const int k_batch = (p->split_k > 1) ? 
p->split_k : 1; + ck_tile::GroupedConvBwdWeightHostArgs args(param, x, dw, {}, dy, k_batch); + ck_tile::stream_config sc{stream, true, 1, 3, 10}; + return ConvBwdWeight3dLauncher::launch(args, sc); +} +#endif + +// ========================================================================= +// Main dispatch +// +// direction=0 (forward): a=X(input), b=W(weight), c=Y(output) +// direction=1 (bwd_data): a=dY(grad_out), b=W(weight), c=dX(grad_in) +// direction=2 (bwd_weight): a=X(input), b=dY(grad_out), c=dW(grad_wei) +// ========================================================================= +float conv_dispatcher_run( + const void* a_ptr, const void* b_ptr, void* c_ptr, const ConvProblemC* prob, void* stream) +{ + if(!prob || !a_ptr || !b_ptr || !c_ptr) + return -1.0f; + + const bool is_3d = (prob->input_d > 1 || prob->filter_z > 1); + hipStream_t hip_stream = static_cast<hipStream_t>(stream); + + try + { + switch(prob->direction) + { + case CONV_FORWARD: +#ifdef CONV_FWD_3D_AVAILABLE + if(is_3d) + return launch_fwd_3d(a_ptr, b_ptr, c_ptr, prob, hip_stream); +#endif +#ifdef CONV_FWD_2D_AVAILABLE + if(!is_3d) + return launch_fwd_2d(a_ptr, b_ptr, c_ptr, prob, hip_stream); +#endif + return -2.0f; + + case CONV_BWD_DATA: +#ifdef CONV_BWD_DATA_3D_AVAILABLE + if(is_3d) + return launch_bwd_data_3d(a_ptr, b_ptr, c_ptr, prob, hip_stream); +#endif +#ifdef CONV_BWD_DATA_2D_AVAILABLE + if(!is_3d) + return launch_bwd_data_2d(a_ptr, b_ptr, c_ptr, prob, hip_stream); +#endif + return -2.0f; + + case CONV_BWD_WEIGHT: +#ifdef CONV_BWD_WEIGHT_3D_AVAILABLE + if(is_3d) + return launch_bwd_weight_3d(a_ptr, b_ptr, c_ptr, prob, hip_stream); +#endif +#ifdef CONV_BWD_WEIGHT_2D_AVAILABLE + if(!is_3d) + return launch_bwd_weight_2d(a_ptr, b_ptr, c_ptr, prob, hip_stream); +#endif + return -2.0f; + + default: return -1.0f; + } + } + catch(const std::exception&) + { + return -3.0f; // Kernel rejected args (e.g. unsupported tile/channel combo) + } + catch(...)
+ { + return -3.0f; + } +} + } // extern "C" diff --git a/dispatcher/codegen/ADDING_NEW_GPU.md b/dispatcher/codegen/ADDING_NEW_GPU.md index 0bd2966a85..664b59b6b1 100644 --- a/dispatcher/codegen/ADDING_NEW_GPU.md +++ b/dispatcher/codegen/ADDING_NEW_GPU.md @@ -9,8 +9,8 @@ Guide for adding support for a new AMD GPU architecture to the CK Tile Dispatche The dispatcher uses `arch_specs.json` as the **single source of truth** for GPU specifications: ``` -arch_specs.json → generate_arch_specs.py → arch_specs_generated.py (Python) - → arch_specs_generated.hpp (C++) +arch_specs.json -> generate_arch_specs.py -> arch_specs_generated.py (Python) + -> arch_specs_generated.hpp (C++) ``` ## Quick Start @@ -175,14 +175,14 @@ for error in result.errors: ``` codegen/ -├── arch_specs.json # Single source of truth (EDIT THIS) -├── generate_arch_specs.py # Generator script -├── arch_specs_generated.py # Generated Python module -└── ADDING_NEW_GPU.md # This file +|---- arch_specs.json # Single source of truth (EDIT THIS) +|---- generate_arch_specs.py # Generator script +|---- arch_specs_generated.py # Generated Python module ++---- ADDING_NEW_GPU.md # This file include/ck_tile/dispatcher/ -├── arch_specs_generated.hpp # Generated C++ header -└── arch_filter.hpp # C++ filter +|---- arch_specs_generated.hpp # Generated C++ header ++---- arch_filter.hpp # C++ filter ``` ## Best Practices diff --git a/dispatcher/codegen/README.md b/dispatcher/codegen/README.md index 2d753924f5..40a9b7b8c1 100644 --- a/dispatcher/codegen/README.md +++ b/dispatcher/codegen/README.md @@ -1,11 +1,22 @@ -# CK Tile GEMM Unified Code Generator +# CK Tile Unified Code Generators -Single source of truth for all GEMM kernel generation. +Single source of truth for GEMM and Grouped Convolution kernel generation. > **See also:** [Main Dispatcher README](../README.md) for installation and core concepts. +## Shared Infrastructure + +Both GEMM and Grouped Conv generators share common code via `codegen_common.py`: +- `TileConfig` - Dataclass for tile dimensions +- `TraitConfigBase` - Base for kernel trait configurations with arch-aware validation +- `CommonTypeMappings` - Dtype-to-C++ type mappings +- `parallel_generate()` - Parallel kernel generation with per-kernel progress logging +- Arch-aware expansion helpers (`valid_wave_configs`, `valid_warp_configs`, etc.) + ## Quick Start +### GEMM + ```bash cd dispatcher/codegen @@ -22,6 +33,25 @@ python3 unified_gemm_codegen.py \ --variants standard preshuffle multi_d ``` +### Grouped Convolution + +```bash +cd dispatcher/codegen + +# Generate forward FP16 grouped conv kernels +python3 unified_grouped_conv_codegen.py \ + --output-dir ../build/generated_kernels \ + --datatype fp16 \ + --variant forward \ + --ndim-spatial 2 + +# Generate backward data kernels +python3 unified_grouped_conv_codegen.py \ + --output-dir ../build/generated_kernels \ + --variant backward_data \ + --ndim-spatial 2 +``` + ## Using from Python ```python @@ -58,13 +88,13 @@ results = codegen.generate_all() ## Variants ### Standard -Basic GEMM: `C = A × B` +Basic GEMM: `C = A x B` ### PreShuffle Optimized weight access with LDS pre-shuffling. Best for large matrices. 
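A minimal sketch of generating only this variant, assuming `unified_gemm_codegen.py` accepts the same `--output-dir`/`--datatype` flags as the grouped-conv generator in the Quick Start above (only `--variants` is shown there verbatim):

```bash
# Sketch: restrict generation to preshuffle GEMM kernels (flag spelling assumed from Quick Start)
cd dispatcher/codegen
python3 unified_gemm_codegen.py \
    --output-dir ../build/generated_kernels \
    --datatype fp16 \
    --variants preshuffle
```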
### Multi-D -Element-wise fusion: `C = op(A × B + D0 + D1 + ...)` +Element-wise fusion: `C = op(A x B + D0 + D1 + ...)` Supported ops: `PassThrough`, `MultiDAdd`, `Relu`, `Gelu`, `Sigmoid`, `Tanh` @@ -72,10 +102,11 @@ Supported ops: `PassThrough`, `MultiDAdd`, `Relu`, `Gelu`, `Sigmoid`, `Tanh` ``` generated_kernels/ -├── gemm_fp16_rcr_compv4_..._128x128x32_....hpp -├── gemm_fp16_rcr_compv4_..._preshuffle.hpp -├── gemm_fp16_rcr_compv4_..._multid_Relu_d1.hpp -└── ... +|---- gemm_fp16_rcr_compv4_..._128x128x32_....hpp # GEMM kernels +|---- gemm_fp16_rcr_compv4_..._preshuffle.hpp +|---- gemm_fp16_rcr_compv4_..._multid_Relu_d1.hpp +|---- grouped_conv_fwd_fp16_nhwgc_..._128x128x32_....hpp # Grouped conv kernels ++---- ... ``` ## Configuration Files diff --git a/dispatcher/codegen/codegen_common.py b/dispatcher/codegen/codegen_common.py new file mode 100644 index 0000000000..4e9e8de1b3 --- /dev/null +++ b/dispatcher/codegen/codegen_common.py @@ -0,0 +1,350 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Shared codegen infrastructure for GEMM and grouped convolution code generators. + +Extracted from unified_gemm_codegen.py + arch-aware expansion helpers from conv. +Both unified_gemm_codegen.py and unified_grouped_conv_codegen.py import from here +to eliminate duplication. +""" + +import logging +import concurrent.futures +from dataclasses import dataclass +from typing import ( + Callable, + ClassVar, + Dict, + FrozenSet, + List, + Optional, + Sequence, + Tuple, + TypeVar, +) + +log = logging.getLogger(__name__) + +T = TypeVar("T") +R = TypeVar("R") + +ANY_INT = -1 + + +# ============================================================================ +# Tile and Trait Configuration (shared between GEMM and Conv) +# ============================================================================ + + +@dataclass +class TileConfig: + """Tile configuration parameters shared by GEMM and grouped conv.""" + + tile_m: int + tile_n: int + tile_k: int + warp_m: int + warp_n: int + warp_k: int + warp_tile_m: int + warp_tile_n: int + warp_tile_k: int + + def is_valid(self) -> bool: + if self.tile_m <= 0 or self.tile_n <= 0 or self.tile_k <= 0: + return False + return ( + self.tile_m % (self.warp_m * self.warp_tile_m) == 0 + and self.tile_n % (self.warp_n * self.warp_tile_n) == 0 + and self.tile_k % (self.warp_k * self.warp_tile_k) == 0 + ) + + +@dataclass +class TraitConfigBase: + """ + Base kernel trait configuration shared by GEMM and grouped conv. + + GEMM extends this with ``persistent``; grouped conv extends with + ``double_smem_buffer`` and ``num_groups_to_merge``. + """ + + pipeline: str # mem, compv3, compv4, compv5, ... + epilogue: str # cshuffle, default + scheduler: str # intrawave, interwave + pad_m: bool + pad_n: bool + pad_k: bool + + # Unsupported (pipeline, epilogue, scheduler) combinations. + # Only 'mem' and 'basic_v1' pipelines support interwave; all compute + # pipelines (compv3/v4/v5/v6/async) only support intrawave. 
+ _UNSUPPORTED: ClassVar[FrozenSet] = frozenset( + { + ("compv3", "cshuffle", "interwave"), + ("compv3", "default", "interwave"), + ("compv4", "cshuffle", "interwave"), + ("compv4", "default", "interwave"), + ("compv5", "cshuffle", "interwave"), + ("compv5", "default", "interwave"), + ("compv6", "cshuffle", "interwave"), + ("compv6", "default", "interwave"), + ("comp_async", "cshuffle", "interwave"), + ("comp_async", "default", "interwave"), + ("basic_async_v1", "cshuffle", "interwave"), + ("basic_async_v1", "default", "interwave"), + } + ) + + def is_valid(self) -> bool: + return (self.pipeline, self.epilogue, self.scheduler) not in self._UNSUPPORTED + + +# ============================================================================ +# Type Mappings (centralized for both GEMM and conv codegen) +# ============================================================================ + + +class CommonTypeMappings: + """Centralized type mappings shared by GEMM and grouped conv codegen.""" + + DTYPE_TO_CK = { + "fp16": "fp16_t", + "bf16": "bf16_t", + "fp32": "float", + "fp8": "fp8_t", + "bf8": "bf8_t", + "int8": "int8_t", + } + + DTYPE_TO_CK_QUALIFIED = { + "fp16": "ck_tile::fp16_t", + "bf16": "ck_tile::bf16_t", + "fp32": "float", + "fp8": "ck_tile::fp8_t", + "bf8": "ck_tile::bf8_t", + "int8": "int8_t", + } + + DTYPE_TO_DISPATCHER = { + "fp16": "DataType::FP16", + "bf16": "DataType::BF16", + "fp32": "DataType::FP32", + "fp8": "DataType::FP8", + "bf8": "DataType::BF8", + "int8": "DataType::INT8", + } + + # GEMM-specific layout mappings ("r"/"c" for row/column major). + # Convolution layouts (NHWGC, GKYXC, etc.) are handled by + # unified_grouped_conv_codegen.py via GroupedConvLayout / GroupedConvTypeMappings. + GEMM_LAYOUT_TO_CK = { + "r": "tensor_layout::gemm::RowMajor", + "c": "tensor_layout::gemm::ColumnMajor", + } + LAYOUT_TO_CK = GEMM_LAYOUT_TO_CK # backward compat alias + + GEMM_LAYOUT_TO_DISPATCHER = { + "r": "LayoutTag::RowMajor", + "c": "LayoutTag::ColMajor", + } + LAYOUT_TO_DISPATCHER = GEMM_LAYOUT_TO_DISPATCHER # backward compat alias + + # GEMM-only pipeline mappings (used by unified_gemm_codegen.py). + # Convolution pipelines are in GroupedConvTypeMappings + # (unified_grouped_conv_codegen.py). CK Tile conv supports: + # BASIC_V1, Mem, CompV3, CompV4, CompV5, CompV6, ASYNC_V1, ASYNC_V4. + # The dispatcher currently generates: mem, compv3, compv4. + # preshufflev2 is GEMM-only (weight pre-shuffle for GEMM, not conv). 
+ PIPELINE_TO_CK = { + "mem": "GemmPipelineAgBgCrMem", + "compv3": "GemmPipelineAgBgCrCompV3", + "compv4": "GemmPipelineAgBgCrCompV4", + "compv5": "GemmPipelineAgBgCrCompV5", + "preshufflev2": "WeightPreshufflePipelineAGmemBGmemCRegV2", + } + + PIPELINE_TO_BASE = { + "mem": "BaseGemmPipelineAgBgCrMem", + "compv3": "BaseGemmPipelineAgBgCrCompV3", + "compv4": "BaseGemmPipelineAgBgCrCompV4", + "compv5": "BaseGemmPipelineAgBgCrCompV5", + "preshufflev2": "BaseWeightPreshufflePipelineAGmemBGmemCRegV2", + } + + PIPELINE_TO_DISPATCHER = { + "mem": "Pipeline::Mem", + "compv3": "Pipeline::CompV3", + "compv4": "Pipeline::CompV4", + "compv5": "Pipeline::CompV5", + "preshufflev2": "Pipeline::PreShuffleV2", + } + + SCHEDULER_TO_CK = { + "intrawave": "GemmPipelineScheduler::Intrawave", + "interwave": "GemmPipelineScheduler::Interwave", + "default": "GemmPipelineScheduler::Default", + } + + SCHEDULER_TO_DISPATCHER = { + "intrawave": "Scheduler::Intrawave", + "interwave": "Scheduler::Interwave", + "default": "Scheduler::Auto", + } + + EPILOGUE_TO_DISPATCHER = { + "cshuffle": "Epilogue::CShuffle", + "default": "Epilogue::Default", + } + + @staticmethod + def get_output_dtype(dtype: str) -> str: + """Get output datatype (fp8/bf8 -> fp16).""" + return "fp16" if dtype in ("fp8", "bf8") else dtype + + +# ============================================================================ +# Code Generation Helpers +# ============================================================================ + + +def generate_cpp_compilation_unit(kernel_name: str) -> str: + """Generate a .cpp compilation unit that includes a kernel header. + + This is the standard pattern: one .cpp per kernel that just includes + the generated .hpp header, causing template instantiation. + """ + return ( + f"// Auto-generated compilation unit for {kernel_name}\n" + f'#include "{kernel_name}.hpp"\n' + ) + + +def parallel_generate( + generate_fn: Callable[[T], R], + items: Sequence[T], + parallel: bool = True, +) -> List[R]: + """Run ``generate_fn`` over ``items``, optionally in parallel. + + Logs per-item progress (best-of-conv pattern). + Returns a flat list of results in completion order. + """ + results: List[R] = [] + if not items: + return results + + if parallel and len(items) > 1: + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = {executor.submit(generate_fn, item): item for item in items} + for future in concurrent.futures.as_completed(futures): + result = future.result() + results.append(result) + log.info("Generated: %s", futures[future]) + else: + for item in items: + result = generate_fn(item) + results.append(result) + log.info("Generated: %s", item) + + return results + + +# ============================================================================ +# Arch-Aware Expansion Helpers (adopted from conv kernel_decl.hpp) +# ============================================================================ + +# These load from arch_specs_generated when available, falling back to +# hardcoded defaults that match the most common arch (gfx942). 
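# Illustrative sketch only: a generator expanding wildcard wave/warp settings for a
# target arch could combine these helpers roughly as
#
#   for wave_m, wave_n, wave_k in valid_wave_configs("gfx942"):          # e.g. [2, 2, 1]
#       for wt_m, wt_n, wt_k in valid_warp_configs("gfx942", "fp16"):    # e.g. [32, 32, 16]
#           for pipeline, scheduler in valid_trait_configs():            # e.g. ("compv4", "intrawave")
#               ...  # build a TileConfig / trait config from the expanded values
#
# The example arch/dtype values match the gfx942 fallback data below.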
+ +_arch_data_cache: Optional[Dict] = None + + +def _get_arch_data() -> Dict: + """Load arch filter data, with caching.""" + global _arch_data_cache + if _arch_data_cache is not None: + return _arch_data_cache + + try: + from arch_specs_generated import ( + WARP_SUPPORTED_COMBINATIONS, + WARP_TILE_SUPPORTED_COMBINATIONS, + TRAIT_UNSUPPORTED_COMBINATIONS, + get_supported_archs, + ) + + _arch_data_cache = { + "warp_combos": WARP_SUPPORTED_COMBINATIONS, + "warp_tile_combos": WARP_TILE_SUPPORTED_COMBINATIONS, + "trait_unsupported": TRAIT_UNSUPPORTED_COMBINATIONS, + "supported_archs": get_supported_archs(), + } + except ImportError: + _arch_data_cache = { + "warp_combos": { + "gfx942": [[1, 4, 1], [2, 2, 1], [4, 1, 1]], + "gfx90a": [[1, 4, 1], [2, 2, 1], [4, 1, 1]], + }, + "warp_tile_combos": { + "gfx942": {"fp16_fp16_fp32": [[16, 16, 16], [32, 32, 16]]}, + "gfx90a": {"fp16_fp16_fp32": [[16, 16, 16], [32, 32, 16]]}, + }, + "trait_unsupported": { + ("compv3", "cshuffle", "interwave"), + ("compv4", "cshuffle", "interwave"), + }, + "supported_archs": ["gfx90a", "gfx942", "gfx950"], + } + return _arch_data_cache + + +def valid_wave_configs(arch: str) -> List[List[int]]: + """Return valid [wave_m, wave_n, wave_k] combos for *arch*.""" + data = _get_arch_data() + return data["warp_combos"].get(arch, [[2, 2, 1]]) + + +def valid_warp_configs(arch: str, dtype: str) -> List[List[int]]: + """Return valid [warp_tile_m, warp_tile_n, warp_tile_k] combos for *arch*/*dtype*. + + The dtype key is constructed as ``{dtype}_{dtype}_{acc}`` where acc is + fp32 for float types and int32 for int8. + """ + data = _get_arch_data() + acc = "int32" if dtype == "int8" else "fp32" + dtype_key = f"{dtype}_{dtype}_{acc}" + arch_tiles = data["warp_tile_combos"].get(arch, {}) + return arch_tiles.get(dtype_key, [[32, 32, 16]]) + + +def valid_trait_configs() -> List[Tuple[str, str]]: + """Return valid (pipeline, scheduler) pairs. + + Compute pipelines only support intrawave; mem supports both. 
+ """ + return [ + ("compv3", "intrawave"), + ("compv4", "intrawave"), + ("compv5", "intrawave"), + ("mem", "intrawave"), + ("mem", "interwave"), + ] + + +def needs_wave_expansion(config: dict) -> bool: + """True if wave_m or wave_n is a wildcard (ANY_INT = -1).""" + return config.get("wave_m", 2) == ANY_INT or config.get("wave_n", 2) == ANY_INT + + +def needs_warp_expansion(config: dict) -> bool: + """True if warp_m or warp_n is a wildcard (ANY_INT = -1).""" + return config.get("warp_m", 32) == ANY_INT or config.get("warp_n", 32) == ANY_INT + + +def needs_pipeline_expansion(config: dict) -> bool: + """True if pipeline is a wildcard (\"*\").""" + return config.get("pipeline", "compv4") == "*" diff --git a/dispatcher/codegen/generate_dispatcher_registration.py b/dispatcher/codegen/generate_dispatcher_registration.py index 024ec4a7c8..8e8b67376c 100644 --- a/dispatcher/codegen/generate_dispatcher_registration.py +++ b/dispatcher/codegen/generate_dispatcher_registration.py @@ -109,7 +109,7 @@ inline void register_all_kernels() """ output_file.write_text(content) - print(f"✓ Generated registration header: {output_file}") + print(f"OK Generated registration header: {output_file}") def generate_registration_cpp(kernels: List[KernelConfig], output_file: Path): @@ -143,7 +143,7 @@ namespace generated { """ output_file.write_text(content) - print(f"✓ Generated registration implementation: {output_file}") + print(f"OK Generated registration implementation: {output_file}") def generate_kernel_wrapper_header(kernel: KernelConfig, output_dir: Path): @@ -414,8 +414,8 @@ def main(): with open(manifest_output, "w") as f: json.dump(manifest_data, f, indent=2) - print(f"✓ Generated manifest: {manifest_output}") - print("\n✓ Registration code generation complete!") + print(f"OK Generated manifest: {manifest_output}") + print("\nOK Registration code generation complete!") print(f" Total kernels: {len(kernels)}") print(" Output files:") print(f" - {registration_header}") diff --git a/dispatcher/codegen/generate_kernel_wrappers.py b/dispatcher/codegen/generate_kernel_wrappers.py index 53a9bff3ed..e11bd7a0a5 100644 --- a/dispatcher/codegen/generate_kernel_wrappers.py +++ b/dispatcher/codegen/generate_kernel_wrappers.py @@ -17,10 +17,10 @@ Usage: Output structure: build/kernel_wrappers/ - ├── gemm_fp16_rcr_128x128x32.cpp - ├── gemm_fp16_rcr_256x256x64.cpp - ├── conv_fwd_fp16_2d_128x128.cpp - └── ... + |---- gemm_fp16_rcr_128x128x32.cpp + |---- gemm_fp16_rcr_256x256x64.cpp + |---- conv_fwd_fp16_2d_128x128.cpp + +---- ... Each .cpp simply includes its corresponding .hpp and forces symbol emission. 
""" diff --git a/dispatcher/codegen/kernel_config_loader.py b/dispatcher/codegen/kernel_config_loader.py index 537fc40581..980b4e5fd0 100644 --- a/dispatcher/codegen/kernel_config_loader.py +++ b/dispatcher/codegen/kernel_config_loader.py @@ -359,8 +359,8 @@ class ConvTraitConfig: @dataclass -class ConvKernelConfig: - """Complete convolution kernel configuration""" +class GroupedConvKernelConfig: + """Complete grouped convolution kernel configuration""" tile: ConvTileConfig = field(default_factory=ConvTileConfig) trait: ConvTraitConfig = field(default_factory=ConvTraitConfig) @@ -419,7 +419,11 @@ class ConvKernelConfig: def kernel_name(self) -> str: """Generate kernel name from config""" - variant_map = {"forward": "fwd", "bwd_data": "bwdd", "bwd_weight": "bwdw"} + variant_map = { + "forward": "fwd", + "bwd_data": "bwd_data", + "bwd_weight": "bwd_weight", + } var_str = variant_map.get(self.variant, self.variant) name = f"conv_{var_str}_{self.dtype_input}_{self.ndim}d" @@ -433,11 +437,11 @@ class ConvKernelConfig: @dataclass -class ConvKernelConfigSet: +class GroupedConvKernelConfigSet: """A set of convolution kernel configurations loaded from JSON""" name: str = "default" - configs: List[ConvKernelConfig] = field(default_factory=list) + configs: List[GroupedConvKernelConfig] = field(default_factory=list) # Tile parameter ranges tile_m_values: List[int] = field(default_factory=lambda: [128]) @@ -481,7 +485,7 @@ class ConvKernelConfigSet: layout: str = "nhwgc" gpu_targets: List[str] = field(default_factory=lambda: ["gfx942"]) - def generate_configs(self) -> Iterator[ConvKernelConfig]: + def generate_configs(self) -> Iterator[GroupedConvKernelConfig]: """Generate all kernel configurations (cartesian product)""" # Tile parameters tile_params = itertools.product( @@ -548,7 +552,7 @@ class ConvKernelConfigSet: double_smem_buffer=trait[6], num_groups_to_merge=trait[7], ) - yield ConvKernelConfig( + yield GroupedConvKernelConfig( tile=tile_cfg, trait=trait_cfg, dtype_input=self.dtype_input, @@ -599,7 +603,9 @@ class ConvKernelConfigSet: return tile_count * trait_count * extra_count * len(self.gpu_targets) -def load_conv_kernel_configs(json_path: str | Path) -> ConvKernelConfigSet: +def load_grouped_conv_kernel_configs( + json_path: str | Path, +) -> GroupedConvKernelConfigSet: """ Load convolution kernel configurations from a JSON file. @@ -607,14 +613,14 @@ def load_conv_kernel_configs(json_path: str | Path) -> ConvKernelConfigSet: json_path: Path to JSON configuration file Returns: - ConvKernelConfigSet with all parameter values loaded + GroupedConvKernelConfigSet with all parameter values loaded """ json_path = Path(json_path) with open(json_path) as f: data = json.load(f) - config_set = ConvKernelConfigSet() + config_set = GroupedConvKernelConfigSet() # Name config_set.name = data.get("kernel_set_name", json_path.stem) @@ -680,15 +686,15 @@ def load_conv_kernel_configs(json_path: str | Path) -> ConvKernelConfigSet: def generate_cpp_conv_kernel_set_declaration( - config_set: ConvKernelConfigSet, + config_set: GroupedConvKernelConfigSet, set_name: Optional[str] = None, ) -> str: """ - Generate C++ DECL_CONV_KERNEL_SET code from a ConvKernelConfigSet. + Generate C++ DECL_GROUPED_CONV_KERNEL_SET code from a GroupedConvKernelConfigSet. 
""" name = set_name or config_set.name - lines = [f"DECL_CONV_KERNEL_SET({name},"] + lines = [f"DECL_GROUPED_CONV_KERNEL_SET({name},"] for config in config_set.generate_configs(): line = f' .add("{config.dtype_input}", "{config.variant}", {config.ndim}, ' diff --git a/dispatcher/codegen/unified_gemm_codegen.py b/dispatcher/codegen/unified_gemm_codegen.py index b0dd961be7..a818cec83e 100755 --- a/dispatcher/codegen/unified_gemm_codegen.py +++ b/dispatcher/codegen/unified_gemm_codegen.py @@ -7,7 +7,7 @@ Unified GEMM Code Generator - Single Source of Truth This is THE unified code generator for all GEMM kernel variants: -- Standard GEMM (C = A × B) +- Standard GEMM (C = A x B) - Preshuffle GEMM (optimized weight access) - Multi-D GEMM (element-wise fusion) @@ -25,6 +25,12 @@ from dataclasses import dataclass, asdict from enum import Enum import concurrent.futures +from codegen_common import ( + TileConfig, + TraitConfigBase, + CommonTypeMappings as TypeMappings, +) + # Import architecture filter for GPU-specific validation try: from arch_filter import ArchFilter, KernelConfig as ArchKernelConfig, OperatorType @@ -194,62 +200,14 @@ class GemmVariant(Enum): MULTI_D = "multi_d" -@dataclass -class TileConfig: - """Tile configuration parameters""" - - tile_m: int - tile_n: int - tile_k: int - warp_m: int - warp_n: int - warp_k: int - warp_tile_m: int - warp_tile_n: int - warp_tile_k: int - - def is_valid(self) -> bool: - """Validate tile configuration""" - return ( - self.tile_m % (self.warp_m * self.warp_tile_m) == 0 - and self.tile_n % (self.warp_n * self.warp_tile_n) == 0 - and self.tile_k % (self.warp_k * self.warp_tile_k) == 0 - and self.tile_m > 0 - and self.tile_n > 0 - and self.tile_k > 0 - ) +# TileConfig imported from codegen_common @dataclass -class TraitConfig: - """Kernel trait configuration""" +class TraitConfig(TraitConfigBase): + """GEMM-specific trait configuration extending TraitConfigBase with persistent mode.""" - pipeline: str # mem, compv3, compv4 - epilogue: str # default, cshuffle - scheduler: str # intrawave, interwave - pad_m: bool - pad_n: bool - pad_k: bool - persistent: bool - - def is_valid(self) -> bool: - """Check if trait combination is valid""" - # Unsupported combinations - # Only 'mem' pipeline supports interwave scheduler. - # All compute pipelines (compv3/v4/v5/v6/async) only support intrawave. 
- unsupported = { - ("compv3", "cshuffle", "interwave"), - ("compv3", "default", "interwave"), - ("compv4", "cshuffle", "interwave"), - ("compv4", "default", "interwave"), - ("compv5", "cshuffle", "interwave"), - ("compv5", "default", "interwave"), - ("compv6", "cshuffle", "interwave"), - ("compv6", "default", "interwave"), - ("comp_async", "cshuffle", "interwave"), - ("comp_async", "default", "interwave"), - } - return (self.pipeline, self.epilogue, self.scheduler) not in unsupported + persistent: bool = False @dataclass @@ -345,89 +303,7 @@ class KernelConfig: # ============================================================================ -class TypeMappings: - """Centralized type mappings for code generation""" - - DTYPE_TO_CK = { - "fp16": "fp16_t", - "bf16": "bf16_t", - "fp32": "float", - "fp8": "fp8_t", - "bf8": "bf8_t", - "int8": "int8_t", - } - - # Fully-qualified types for use outside of 'using namespace ck_tile' scope - DTYPE_TO_CK_QUALIFIED = { - "fp16": "ck_tile::fp16_t", - "bf16": "ck_tile::bf16_t", - "fp32": "float", # Built-in type, no namespace - "fp8": "ck_tile::fp8_t", - "bf8": "ck_tile::bf8_t", - "int8": "int8_t", # Built-in type - } - - DTYPE_TO_DISPATCHER = { - "fp16": "DataType::FP16", - "bf16": "DataType::BF16", - "fp32": "DataType::FP32", - "fp8": "DataType::FP8", - "bf8": "DataType::BF8", - "int8": "DataType::INT8", - } - - LAYOUT_TO_CK = { - "r": "tensor_layout::gemm::RowMajor", - "c": "tensor_layout::gemm::ColumnMajor", - } - - LAYOUT_TO_DISPATCHER = { - "r": "LayoutTag::RowMajor", - "c": "LayoutTag::ColMajor", - } - - PIPELINE_TO_CK = { - "mem": "GemmPipelineAgBgCrMem", - "compv3": "GemmPipelineAgBgCrCompV3", - "compv4": "GemmPipelineAgBgCrCompV4", - "preshufflev2": "WeightPreshufflePipelineAGmemBGmemCRegV2", - } - - PIPELINE_TO_BASE = { - "mem": "BaseGemmPipelineAgBgCrMem", - "compv3": "BaseGemmPipelineAgBgCrCompV3", - "compv4": "BaseGemmPipelineAgBgCrCompV4", - "preshufflev2": "BaseWeightPreshufflePipelineAGmemBGmemCRegV2", - } - - PIPELINE_TO_DISPATCHER = { - "mem": "Pipeline::Mem", - "compv3": "Pipeline::CompV3", - "compv4": "Pipeline::CompV4", - "preshufflev2": "Pipeline::PreShuffleV2", - } - - SCHEDULER_TO_CK = { - "intrawave": "GemmPipelineScheduler::Intrawave", - "interwave": "GemmPipelineScheduler::Interwave", - "default": "GemmPipelineScheduler::Default", - } - - SCHEDULER_TO_DISPATCHER = { - "intrawave": "Scheduler::Intrawave", - "interwave": "Scheduler::Interwave", - "default": "Scheduler::Auto", - } - - EPILOGUE_TO_DISPATCHER = { - "cshuffle": "Epilogue::CShuffle", - "default": "Epilogue::Default", - } - - @staticmethod - def get_output_dtype(dtype: str) -> str: - """Get output datatype (fp8/bf8 -> fp16)""" - return "fp16" if dtype in ["fp8", "bf8"] else dtype +# TypeMappings imported from codegen_common as CommonTypeMappings -> TypeMappings alias # ============================================================================ @@ -1068,7 +944,11 @@ class UnifiedGemmCodegen: } def generate_all(self, parallel: bool = True) -> Dict: - """Generate all kernels""" + """Generate all kernels. + + When parallel=True, all configs across all variants are collected first, + then generated concurrently in a single thread pool for maximum throughput. 
+ """ log.info("Generating GEMM kernels:") log.info(f" Datatype: {self.datatype}") log.info(f" Layout: {self.layout}") @@ -1078,49 +958,24 @@ class UnifiedGemmCodegen: results = {"kernels": [], "wrappers": [], "failed": []} - # Get configurations + # Collect ALL configs across all variants/preselected sets upfront + all_configs = [] if self.use_preselected: - configs = self._get_preselected_configs() - log.info(f" Total configurations: {len(configs)}") + all_configs = self._get_preselected_configs() + log.info(f" Total configurations: {len(all_configs)}") else: for variant in self.variants: - log.info(f"\nGenerating {variant.value} kernels...") configs = self._get_configs_for_variant(variant) - log.info(f" Configurations: {len(configs)}") + log.info(f" {variant.value}: {len(configs)} configurations") + all_configs.extend(configs) + log.info(f" Total across all variants: {len(all_configs)}") - if parallel: - with concurrent.futures.ThreadPoolExecutor() as executor: - futures = [ - executor.submit(self._generate_one, cfg) for cfg in configs - ] - for future in concurrent.futures.as_completed(futures): - try: - k, w = future.result() - results["kernels"].append(k) - results["wrappers"].append(w) - except Exception as e: - results["failed"].append(str(e)) - log.error(f"Failed: {e}") - else: - for cfg in configs: - try: - k, w = self._generate_one(cfg) - results["kernels"].append(k) - results["wrappers"].append(w) - except Exception as e: - results["failed"].append(str(e)) - log.error(f"Failed: {e}") - - # Generate registration header - if results["wrappers"]: - self._generate_registration_header(results["wrappers"]) - - return results - - # Generate from preselected set - if parallel: + # Generate all configs in a single parallel pass + if parallel and all_configs: with concurrent.futures.ThreadPoolExecutor() as executor: - futures = [executor.submit(self._generate_one, cfg) for cfg in configs] + futures = [ + executor.submit(self._generate_one, cfg) for cfg in all_configs + ] for future in concurrent.futures.as_completed(futures): try: k, w = future.result() @@ -1130,7 +985,7 @@ class UnifiedGemmCodegen: results["failed"].append(str(e)) log.error(f"Failed: {e}") else: - for cfg in configs: + for cfg in all_configs: try: k, w = self._generate_one(cfg) results["kernels"].append(k) @@ -1139,7 +994,6 @@ class UnifiedGemmCodegen: results["failed"].append(str(e)) log.error(f"Failed: {e}") - # Generate registration header if results["wrappers"]: self._generate_registration_header(results["wrappers"]) @@ -1638,12 +1492,19 @@ def main(): # Write to temp file and use as config import tempfile + import os as _os - with tempfile.NamedTemporaryFile( + _tmp_config = tempfile.NamedTemporaryFile( mode="w", suffix=".json", delete=False - ) as f: - json.dump(full_config, f) - args.config = Path(f.name) + ) + try: + json.dump(full_config, _tmp_config) + _tmp_config.close() + args.config = Path(_tmp_config.name) + except Exception: + _tmp_config.close() + _os.unlink(_tmp_config.name) + raise except json.JSONDecodeError as e: logging.error(f"Invalid tile-config-json: {e}") return 1 @@ -1672,7 +1533,7 @@ def main(): results = codegen.generate_all(parallel=not args.no_parallel) - logging.info("\n✅ Generation complete!") + logging.info("\nGeneration complete.") logging.info(f" Kernels: {len(results['kernels'])}") logging.info(f" Wrappers: {len(results['wrappers'])}") logging.info(f" Failed: {len(results['failed'])}") @@ -1684,7 +1545,7 @@ def main(): # Generate dispatcher registration if requested if args.register: 
- logging.info("\n📝 Generating dispatcher registration code...") + logging.info("\nGenerating dispatcher registration code...") try: from generate_dispatcher_registration import ( scan_generated_headers, @@ -1701,11 +1562,20 @@ def main(): ) generate_registration_cpp(kernels, reg_dir / "dispatcher_registration.cpp") - logging.info(f"✓ Generated registration code for {len(kernels)} kernels") + logging.info(f"Generated registration code for {len(kernels)} kernels") except Exception as e: logging.error(f"Failed to generate registration code: {e}") return 1 + # Clean up temp config file if we created one + if args.tile_config_json and args.config and args.config.exists(): + try: + import os as _os + + _os.unlink(args.config) + except OSError: + pass + return 0 if not results["failed"] else 1 diff --git a/dispatcher/codegen/unified_grouped_conv_codegen.py b/dispatcher/codegen/unified_grouped_conv_codegen.py new file mode 100644 index 0000000000..ff40cb4ed4 --- /dev/null +++ b/dispatcher/codegen/unified_grouped_conv_codegen.py @@ -0,0 +1,1757 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Unified Grouped Convolution Code Generator + +This is the unified code generator for all grouped convolution kernel variants: +- Forward grouped convolution +- Backward data grouped convolution +- Backward weight grouped convolution + +Generates both CK Tile kernels AND dispatcher wrappers. +Based on the GEMM codegen pattern. +""" + +import argparse +import logging +from pathlib import Path +from typing import List, Optional, Tuple, Union +from dataclasses import dataclass +from enum import Enum + +from codegen_common import ( + TileConfig, + TraitConfigBase, + parallel_generate, +) + +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +log = logging.getLogger(__name__) + +# Import architecture filter for GPU-specific validation +try: + from arch_filter import ArchFilter, OperatorType + + HAS_ARCH_FILTER = True +except ImportError: + HAS_ARCH_FILTER = False + ArchFilter = None + OperatorType = None + + +# ============================================================================ +# Configuration and Data Structures +# ============================================================================ + + +class GroupedConvVariant(Enum): + """Grouped convolution kernel variants""" + + FORWARD = "forward" + BACKWARD_DATA = "bwd_data" + BACKWARD_WEIGHT = "bwd_weight" + + +class GroupedConvLayout(Enum): + """Grouped convolution data layouts""" + + # 1D + NWGC = "NWGC" # Input/Output: N W G C + GKXC = "GKXC" # Weight: G K X C + NWGK = "NWGK" # Output: N W G K + + # 2D + NHWGC = "NHWGC" # Input: N H W G C + GKYXC = "GKYXC" # Weight: G K Y X C + NHWGK = "NHWGK" # Output: N H W G K + + # 3D + NDHWGC = "NDHWGC" # Input: N D H W G C + GKZYXC = "GKZYXC" # Weight: G K Z Y X C + NDHWGK = "NDHWGK" # Output: N D H W G K + + +@dataclass +class GroupedConvTraitConfig(TraitConfigBase): + """Kernel trait configuration for grouped convolution (extends TraitConfigBase). + + Conv-specific extensions beyond TraitConfigBase. 
These map to + GroupedConvTraits template parameters in grouped_convolution_utils.hpp: + - double_smem_buffer: ping-pong LDS for compute V4+ pipelines + - num_groups_to_merge: fuse multiple groups into one tile (NumGroupsToMerge) + - split_image: split spatial dims for large tensors (EnableSplitImage) + - explicit_gemm: use explicit GEMM path (ExplicitGemm) + - two_stage: two-stage bwd_weight with fp32 workspace + elementwise convert + + Note: CK Tile already uses long_index_t (64-bit) for group strides and + batch offsets, so there is no separate "large_tensor" flag. For large + spatial dimensions, use split_image=True instead. + """ + + double_smem_buffer: bool = False + num_groups_to_merge: int = 1 + split_image: bool = False + explicit_gemm: bool = False + two_stage: bool = False + + +# Backward compatibility alias +TraitConfig = GroupedConvTraitConfig + + +@dataclass +class GroupedConvKernelConfig: + """Complete grouped convolution kernel configuration""" + + tile: TileConfig + trait: GroupedConvTraitConfig + variant: GroupedConvVariant = GroupedConvVariant.FORWARD + ndim_spatial: int = 2 # 1D, 2D, or 3D + arch: str = "gfx942" # Target architecture + layout: Union[str, GroupedConvLayout] = ( + "nhwgc" # Data layout (e.g., "nhwgc", "ndhwgc") + ) + + # Vector sizes: a=4 for fp16 input (8-byte aligned global loads), + # b=8 for weight tensor, c=8 for output stores. These match the + # CK Tile default vectorization widths for fp16 on CDNA3 (gfx942). + vector_size_a: int = 4 + vector_size_b: int = 8 + vector_size_c: int = 8 + vector_sizes: Optional[Tuple[int, int, int]] = None + + # Occupancy parameters + block_per_cu: int = 1 + num_wave_groups: int = 1 + num_groups_to_merge: int = 1 + + # Double buffering + double_smem_buffer: bool = False + + def __post_init__(self): + if self.vector_sizes is not None: + self.vector_size_a, self.vector_size_b, self.vector_size_c = ( + self.vector_sizes[:3] + ) + # Sync trait fields with top-level fields (trait is source of truth + # when both are specified, but top-level overrides default trait values). + if self.double_smem_buffer and not self.trait.double_smem_buffer: + self.trait.double_smem_buffer = self.double_smem_buffer + elif self.trait.double_smem_buffer: + self.double_smem_buffer = self.trait.double_smem_buffer + if self.num_groups_to_merge != 1 and self.trait.num_groups_to_merge == 1: + self.trait.num_groups_to_merge = self.num_groups_to_merge + elif self.trait.num_groups_to_merge != 1: + self.num_groups_to_merge = self.trait.num_groups_to_merge + + def _layout_str(self) -> str: + """Get layout as lowercase string for naming.""" + if hasattr(self.layout, "value"): + return self.layout.value.lower() + return str(self.layout).lower() + + def name(self, datatype: str) -> str: + """ + Generate kernel name that uniquely identifies the kernel configuration. + + Format: grouped_conv_{variant}_{dtype}_{layout}_{ndim}d_{pipeline}_{epilogue}_{scheduler} + _{tile_m}x{tile_n}x{tile_k}_{warp_m}x{warp_n}x{warp_k} + _{warp_tile_m}x{warp_tile_n}x{warp_tile_k} + [_vec{a}_{b}_{c}][_bpc{n}][_wg{n}][_gm{n}][_dsb][_pad{mnk}] + + All parameters that affect kernel behavior MUST be included to ensure + unique names for unique configurations: + - Variant (fwd/bwd_data/bwd_weight) + - Data type + - Layout (nhwgc, nchw, ndhwgc, etc.) 
+ - Spatial dimensions (2d/3d) + - Pipeline, epilogue, scheduler + - Tile, warp, warp_tile dimensions + - Vector sizes, occupancy hints (if non-default) + - Double SMEM buffer, padding flags + """ + t = self.tile + tr = self.trait + layout_str = self._layout_str() + + variant_str = { + GroupedConvVariant.FORWARD: "fwd", + GroupedConvVariant.BACKWARD_DATA: "bwd_data", + GroupedConvVariant.BACKWARD_WEIGHT: "bwd_weight", + }[self.variant] + + # Core identity: variant, dtype, layout, dims + name = ( + f"grouped_conv_{variant_str}_{datatype}_{layout_str}_{self.ndim_spatial}d" + ) + + # Pipeline configuration + name += f"_{tr.pipeline}_{tr.epilogue}_{tr.scheduler}" + + # Block tile dimensions (M_Tile x N_Tile x K_Tile) + name += f"_{t.tile_m}x{t.tile_n}x{t.tile_k}" + + # Wave distribution (M_Warp x N_Warp x K_Warp) + name += f"_{t.warp_m}x{t.warp_n}x{t.warp_k}" + + # Warp tile dimensions (M_Warp_Tile x N_Warp_Tile x K_Warp_Tile) + name += f"_{t.warp_tile_m}x{t.warp_tile_n}x{t.warp_tile_k}" + + # Vector sizes (only if non-default) + if (self.vector_size_a, self.vector_size_b, self.vector_size_c) != (4, 8, 8): + name += ( + f"_vec{self.vector_size_a}_{self.vector_size_b}_{self.vector_size_c}" + ) + + # Occupancy hints (only if non-default) + if self.block_per_cu != 1: + name += f"_bpc{self.block_per_cu}" + + if self.num_wave_groups != 1: + name += f"_wg{self.num_wave_groups}" + + if self.num_groups_to_merge != 1: + name += f"_gm{self.num_groups_to_merge}" + + # Double SMEM buffer (for compute V4+) + if self.double_smem_buffer or tr.double_smem_buffer: + name += "_dsb" + + # Two-stage bwd_weight (fp32 workspace + elementwise convert) + if tr.two_stage: + name += "_2stage" + + # Padding suffix (only if not all enabled) + if not (tr.pad_m and tr.pad_n and tr.pad_k): + name += f"_pad{int(tr.pad_m)}{int(tr.pad_n)}{int(tr.pad_k)}" + + return name + + def is_valid_for_arch(self, arch: Optional[str] = None) -> bool: + """Check if configuration is valid for target architecture""" + target_arch = arch if arch is not None else self.arch + + # Check trait validity + if not self.trait.is_valid(): + return False + + # Backward operations have stricter pipeline requirements: + # - Backward weight: compv4/compv5 have transpose_tile2d issues + # - Backward data: compv4 has get_length issues in bwd_data kernel + # Both backward operations ONLY support compv3 and mem pipelines + if self.variant in ( + GroupedConvVariant.BACKWARD_WEIGHT, + GroupedConvVariant.BACKWARD_DATA, + ): + if self.trait.pipeline not in ("compv3", "mem"): + return False + + # Check warp configuration (from arch_specs) + try: + from arch_specs_generated import WARP_SUPPORTED_COMBINATIONS + + supported = WARP_SUPPORTED_COMBINATIONS.get(target_arch) + if supported is None: + return False # Unknown architecture + warp_cfg = [self.tile.warp_m, self.tile.warp_n, self.tile.warp_k] + if warp_cfg not in supported: + return False + except ImportError: + pass # Allow if arch_specs not available + + return True + + +# ============================================================================ +# Type Mappings +# ============================================================================ + + +class GroupedConvTypeMappings: + """Centralized type mappings for grouped convolution code generation""" + + DTYPE_TO_CK = { + "fp16": "half_t", + "bf16": "bf16_t", + "fp32": "float", + } + + # CK Tile conv pipelines (from conv_configs.hpp PipelineTypeTraits). 
+ # basic_v1/mem/compv3 use GroupedConvUniversalPipelineAgBgCrPolicy; + # compv4/compv5/compv6/comp_async/basic_async_v1 use their own default policy. + PIPELINE_TO_CK = { + "basic_v1": "GemmPipeline::BASIC_V1", + "mem": "GemmPipeline::MEMORY", + "compv3": "GemmPipeline::COMPUTE_V3", + "compv4": "GemmPipeline::COMPUTE_V4", + "compv5": "GemmPipeline::COMPUTE_V5", + "compv6": "GemmPipeline::COMPUTE_V6", + "comp_async": "GemmPipeline::COMPUTE_ASYNC", + "basic_async_v1": "GemmPipeline::BASIC_ASYNC_V1", + } + + SCHEDULER_TO_CK = { + "intrawave": "GemmPipelineScheduler::Intrawave", + "interwave": "GemmPipelineScheduler::Interwave", + } + + LAYOUT_1D = { + "in": "tensor_layout::convolution::NWGC", + "wei": "tensor_layout::convolution::GKXC", + "out": "tensor_layout::convolution::NWGK", + } + + LAYOUT_2D = { + "in": "tensor_layout::convolution::NHWGC", + "wei": "tensor_layout::convolution::GKYXC", + "out": "tensor_layout::convolution::NHWGK", + } + + LAYOUT_3D = { + "in": "tensor_layout::convolution::NDHWGC", + "wei": "tensor_layout::convolution::GKZYXC", + "out": "tensor_layout::convolution::NDHWGK", + } + + @classmethod + def get_layouts(cls, ndim: int) -> dict: + if ndim == 1: + return cls.LAYOUT_1D + elif ndim == 2: + return cls.LAYOUT_2D + else: + return cls.LAYOUT_3D + + +# ============================================================================ +# CK Tile Grouped Conv Kernel Generator +# ============================================================================ + + +class CKTileGroupedConvKernelGenerator: + """Generates CK Tile grouped convolution kernel instance code""" + + def __init__( + self, + datatype: str, + variant: GroupedConvVariant = GroupedConvVariant.FORWARD, + ): + self.datatype = datatype + self.variant = variant + self.tm = GroupedConvTypeMappings() + + def generate(self, config: GroupedConvKernelConfig) -> str: + """Generate complete CK Tile grouped convolution kernel""" + kernel_name = config.name(self.datatype) + return f"""{self._header(kernel_name, config)} +{self._config_struct(config, kernel_name)} +{self._kernel_instance(config, kernel_name)} +""" + + def _header(self, kernel_name: str, config: GroupedConvKernelConfig) -> str: + """Generate header includes based on variant""" + if self.variant == GroupedConvVariant.BACKWARD_DATA: + kernel_header = "grouped_convolution_backward_data_kernel.hpp" + elif self.variant == GroupedConvVariant.BACKWARD_WEIGHT: + kernel_header = "grouped_convolution_backward_weight_kernel.hpp" + else: + kernel_header = "grouped_convolution_forward_kernel.hpp" + + elementwise_include = "" + if config.trait.two_stage: + elementwise_include = '\n#include "ck_tile/ops/elementwise.hpp"' + + return f"""// SPDX-License-Identifier: MIT +// Auto-generated CK Tile Grouped Convolution kernel: {kernel_name} +// Variant: {self.variant.value} +#pragma once + +#include +#include +#include +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/grouped_convolution/kernel/{kernel_header}" +#include "ck_tile/ops/grouped_convolution/pipeline/grouped_conv_universal_pipeline_ag_bg_cr_policy.hpp"{elementwise_include} + +using namespace ck_tile; +""" + + def _config_struct(self, config: GroupedConvKernelConfig, kernel_name: str) -> str: + """Generate config struct""" + t = config.tile + tr = config.trait + layouts = self.tm.get_layouts(config.ndim_spatial) + + return f""" +// Kernel configuration +struct 
{kernel_name}_Config {{ + // Data types + using InDataType = {self.tm.DTYPE_TO_CK[self.datatype]}; + using WeiDataType = {self.tm.DTYPE_TO_CK[self.datatype]}; + using AccDataType = float; + using OutDataType = {self.tm.DTYPE_TO_CK[self.datatype]}; + + // Layouts + using InLayout = {layouts["in"]}; + using WeiLayout = {layouts["wei"]}; + using OutLayout = {layouts["out"]}; + + // Tile shape + static constexpr index_t M_Tile = {t.tile_m}; + static constexpr index_t N_Tile = {t.tile_n}; + static constexpr index_t K_Tile = {t.tile_k}; + + static constexpr index_t M_Warp = {t.warp_m}; + static constexpr index_t N_Warp = {t.warp_n}; + static constexpr index_t K_Warp = {t.warp_k}; + + static constexpr index_t M_Warp_Tile = {t.warp_tile_m}; + static constexpr index_t N_Warp_Tile = {t.warp_tile_n}; + static constexpr index_t K_Warp_Tile = {t.warp_tile_k}; + + // Vector sizes + static constexpr index_t VectorSizeA = {config.vector_size_a}; + static constexpr index_t VectorSizeB = {config.vector_size_b}; + static constexpr index_t VectorSizeC = {config.vector_size_c}; + + // Padding + static constexpr bool kPadM = {str(tr.pad_m).lower()}; + static constexpr bool kPadN = {str(tr.pad_n).lower()}; + static constexpr bool kPadK = {str(tr.pad_k).lower()}; + + // Pipeline & Epilogue + static constexpr auto Pipeline = {self.tm.PIPELINE_TO_CK[tr.pipeline]}; + static constexpr auto Scheduler = {self.tm.SCHEDULER_TO_CK[tr.scheduler]}; + static constexpr bool DoubleSmemBuffer = {str(tr.double_smem_buffer).lower()}; + static constexpr bool UseCShuffleEpilogue = {str(tr.epilogue == "cshuffle").lower()}; + + // Other params + static constexpr int kBlockPerCu = {config.block_per_cu}; + static constexpr index_t NumWaveGroups = {config.num_wave_groups}; + static constexpr index_t NumGroupsToMerge = {tr.num_groups_to_merge}; + static constexpr bool EnableSplitImage = {str(tr.split_image).lower()}; + static constexpr bool ExplicitGemm = {str(tr.explicit_gemm).lower()}; + static constexpr index_t NDimSpatial = {config.ndim_spatial}; + + // Target architecture + static constexpr const char* TargetArch = "{config.arch}"; +}}; +""" + + def _kernel_instance( + self, config: GroupedConvKernelConfig, kernel_name: str + ) -> str: + """Generate kernel instantiation code with launch function""" + tr = config.trait + + if self.variant == GroupedConvVariant.BACKWARD_WEIGHT and tr.two_stage: + return self._kernel_instance_two_stage(config, kernel_name) + + # Variant-specific configuration + if self.variant == GroupedConvVariant.BACKWARD_DATA: + host_args_type = "GroupedConvBwdDataHostArgs" + kernel_type = "GroupedConvolutionBackwardDataKernel" + gemm_traits = "GroupedConvImplicitGemmTraitsBwdData" + layout_suffix = "BwdData" + # For bwd_data: A=dOutput, B=Weight, C=dInput + a_dtype = "OutDataType" + b_dtype = "WeiDataType" + c_dtype = "InDataType" + gemm_k_calc = "args.K_ * std::accumulate(args.filter_spatial_lengths_.begin(), args.filter_spatial_lengths_.end()" + direction_prefix = "BWD_DATA" + launcher_alias = "SelectedConvBwdDataLauncher" + elif self.variant == GroupedConvVariant.BACKWARD_WEIGHT: + host_args_type = "GroupedConvBwdWeightHostArgs" + kernel_type = "GroupedConvolutionBackwardWeightKernel" + gemm_traits = "GroupedConvImplicitGemmTraitsBwdWeight" + layout_suffix = "BwdWeight" + # For bwd_weight: A=dOutput, B=Input, C=dWeight (per CK Tile invoker) + a_dtype = "OutDataType" + b_dtype = "InDataType" + c_dtype = "WeiDataType" + gemm_k_calc = "args.N_ * std::accumulate(args.output_spatial_lengths_.begin(), 
args.output_spatial_lengths_.end()" + direction_prefix = "BWD_WEIGHT" + launcher_alias = "SelectedConvBwdWeightLauncher" + else: # Forward + host_args_type = "GroupedConvFwdHostArgs<>" + kernel_type = "GroupedConvolutionForwardKernel" + gemm_traits = "GroupedConvImplicitGemmTraitsFwd" + layout_suffix = "Fwd" + a_dtype = "InDataType" + b_dtype = "WeiDataType" + c_dtype = "OutDataType" + gemm_k_calc = "args.C_ * std::accumulate(args.filter_spatial_lengths_.begin(), args.filter_spatial_lengths_.end()" + direction_prefix = "FWD" + launcher_alias = "SelectedConvKernelLauncher" + + # Create valid C++ namespace name + ns_name = "ns_" + kernel_name.replace("-", "_") + + return f""" +// Unique namespace for this kernel to avoid conflicts when including multiple kernels +namespace {ns_name} {{ + +// Bring Config into namespace +using Config = {kernel_name}_Config; + +// Kernel name for identification +constexpr const char* CONV_{direction_prefix}_KERNEL_NAME = "{kernel_name}"; + +// Selected kernel alias +using SelectedConv{direction_prefix.title()}Kernel = Config; + +// ============================================================================= +// Kernel Launch Implementation ({self.variant.value}) +// ============================================================================= + +struct {kernel_name}_Launcher {{ + using KernelConfig = Config; // Use the Config alias from namespace + using InDataType = typename Config::InDataType; + using WeiDataType = typename Config::WeiDataType; + using OutDataType = typename Config::OutDataType; + using AccDataType = typename Config::AccDataType; + using InLayout = typename Config::InLayout; + using WeiLayout = typename Config::WeiLayout; + using OutLayout = typename Config::OutLayout; + + static constexpr index_t NDimSpatial = Config::NDimSpatial; + + // Implicit GEMM shape + using GemmShape = TileGemmShape< + sequence, + sequence, + sequence>; + + // Convolution traits + static constexpr auto ConvSpec = ConvolutionSpecialization::Default; + using GroupedConvTraitsType = GroupedConvTraits< + NDimSpatial, ConvSpec, InLayout, WeiLayout, tuple<>, OutLayout, + Config::VectorSizeA, Config::VectorSizeB, Config::VectorSizeC, + Config::NumGroupsToMerge, Config::EnableSplitImage, Config::ExplicitGemm>; + + // Tile partitioner + using TilePartitioner = GemmSpatiallyLocalTilePartitioner< + GemmShape, + GroupedConvTraitsType::FixedGemmParams::TilePartitionerGroupNum, + GroupedConvTraitsType::FixedGemmParams::TilePartitionerM01>; + + // Universal traits - layout suffix changes per variant + using GemmUniversalTraits = TileGemmUniversalTraits< + GroupedConvTraitsType::FixedGemmParams::kPadM, + GroupedConvTraitsType::FixedGemmParams::kPadN, + GroupedConvTraitsType::FixedGemmParams::kPadK, + Config::DoubleSmemBuffer, + typename GroupedConvTraitsType::AsLayout{layout_suffix}, + typename GroupedConvTraitsType::BsLayout{layout_suffix}, + typename GroupedConvTraitsType::CLayout{layout_suffix}, + GroupedConvTraitsType::FixedGemmParams::TransposeC, + GroupedConvTraitsType::FixedGemmParams::UseStructuredSparsity, + GroupedConvTraitsType::FixedGemmParams::Persistent, + Config::NumWaveGroups>; + + // Pipeline problem - data types change per variant + using GemmPipelineProblem = GemmPipelineProblem< + {a_dtype}, {b_dtype}, AccDataType, GemmShape, + typename GroupedConvTraitsType::template {gemm_traits}, + element_wise::PassThrough, element_wise::PassThrough, {c_dtype}, + GroupedConvTraitsType::FixedGemmParams::FixedVectorSize, + GroupedConvTraitsType::VectorSizeA, 
GroupedConvTraitsType::VectorSizeB>; + + // Base pipeline for tail handling + using BaseGemmPipeline = {self._get_base_pipeline(tr.pipeline)}; + + static float launch(const {host_args_type}& args, const stream_config& s) {{ + const index_t gemm_k = {gemm_k_calc}, 1, std::multiplies()); + + const index_t k_grain = args.k_batch * Config::K_Tile; + const index_t K_split = (gemm_k + k_grain - 1) / k_grain * Config::K_Tile; + const index_t num_loop = TilePartitioner::GetLoopNum(K_split); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + + float ave_time{{0}}; + + constexpr auto scheduler = Config::Scheduler; + + using UniversalGemmProblem = UniversalGemmPipelineProblem< + {a_dtype}, {b_dtype}, AccDataType, GemmShape, GemmUniversalTraits, + scheduler, + element_wise::PassThrough, element_wise::PassThrough, {c_dtype}, + GroupedConvTraitsType::FixedGemmParams::FixedVectorSize, + GroupedConvTraitsType::VectorSizeA, GroupedConvTraitsType::VectorSizeB>; + + using GemmPipeline = {self._get_pipeline_template_args(tr.pipeline, "UniversalGemmProblem")}; + + using ConvEpilogue = CShuffleEpilogue, AccDataType, {c_dtype}, + typename GroupedConvTraitsType::ImplicitGemmDsLayout, + typename GroupedConvTraitsType::FixedGemmParams::ELayout, + element_wise::PassThrough, + TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, + Config::M_Warp, Config::N_Warp, Config::M_Warp_Tile, + Config::N_Warp_Tile, Config::K_Warp_Tile, + GroupedConvTraitsType::FixedGemmParams::TransposeC, + Config::NumWaveGroups, + GroupedConvTraitsType::FixedGemmParams::FixedVectorSize, + Config::VectorSizeC, false, 1, Config::DoubleSmemBuffer>>; + + using Kernel = {kernel_type}< + GroupedConvTraitsType, TilePartitioner, GemmPipeline, ConvEpilogue>; + + const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {{ + auto kargs = Kernel::MakeKernelArgs(args); + + if (!Kernel::IsSupportedArgument(kargs)) {{ + throw std::runtime_error("Arguments not supported for grouped conv kernel"); + }} + + const dim3 grids = Kernel::GridSize(kargs); + const dim3 blocks = Kernel::BlockSize(); + + ave_time = launch_kernel(s, make_kernel( + Kernel{{}}, grids, blocks, 0, kargs)); + + return ave_time; + }}; + + BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num); + return ave_time; + }} +}}; + +// Launcher alias for tile_engine compatibility +using {launcher_alias} = {kernel_name}_Launcher; + +}} // namespace {ns_name} + +// Export specific launcher to global namespace +using {kernel_name}_Launcher = {ns_name}::{kernel_name}_Launcher; + +// When used with -include compiler flag, export aliases to global namespace +#ifdef CK_TILE_SINGLE_KERNEL_INCLUDE +using {launcher_alias} = {ns_name}::{launcher_alias}; +constexpr const char* CONV_{direction_prefix}_KERNEL_NAME = {ns_name}::CONV_{direction_prefix}_KERNEL_NAME; +#endif +""" + + # Pipelines that accept GroupedConvUniversalPipelineAgBgCrPolicy + # as a second template parameter for conv-specific LDS layout. + # (from conv_configs.hpp PipelineTypeTraits -- basic_v1/mem/compv3) + # CompV4/V5/V6/comp_async/basic_async_v1 use their own default policies. 
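    # For example (illustrative, matching the helpers below): a "compv3" pipeline
    # with problem type "UniversalGemmProblem" is emitted as
    #   GemmPipelineAgBgCrCompV3<UniversalGemmProblem, GroupedConvUniversalPipelineAgBgCrPolicy>
    # whereas "compv4" keeps its own default policy and becomes
    #   GemmPipelineAgBgCrCompV4<UniversalGemmProblem>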
+ _CONV_POLICY_PIPELINES = {"basic_v1", "mem", "compv3"} + + def _get_pipeline(self, pipeline: str) -> str: + """Get pipeline class name.""" + pipelines = { + "basic_v1": "GemmPipelineAGmemBGmemCRegV1", + "mem": "GemmPipelineAgBgCrMem", + "compv3": "GemmPipelineAgBgCrCompV3", + "compv4": "GemmPipelineAgBgCrCompV4", + "compv5": "GemmPipelineAgBgCrCompV5", + "compv6": "GemmPipelineAgBgCrCompV6", + "comp_async": "GemmPipelineAgBgCrCompAsync", + "basic_async_v1": "GemmPipelineAGmemBGmemCRegAsyncV1", + } + return pipelines.get(pipeline, "GemmPipelineAgBgCrCompV3") + + def _get_pipeline_template_args(self, pipeline: str, problem_type: str) -> str: + """Get full template argument list for pipeline instantiation. + + For basic_v1/mem/compv3, passes GroupedConvUniversalPipelineAgBgCrPolicy + as a second template argument for conv-specific LDS banking. + """ + base = self._get_pipeline(pipeline) + if pipeline in self._CONV_POLICY_PIPELINES: + return f"{base}<{problem_type}, GroupedConvUniversalPipelineAgBgCrPolicy>" + return f"{base}<{problem_type}>" + + def _get_base_pipeline(self, pipeline: str) -> str: + """Get base pipeline class name (used for tail handling only). + + Note: basic_async_v1 inherits from BaseGemmPipelineAGmemBGmemCRegV1 + (there is no separate BaseGemmPipelineAGmemBGmemCRegAsyncV1). + """ + pipelines = { + "basic_v1": "BaseGemmPipelineAGmemBGmemCRegV1", + "mem": "BaseGemmPipelineAgBgCrMem", + "compv3": "BaseGemmPipelineAgBgCrCompV3", + "compv4": "BaseGemmPipelineAgBgCrCompV4", + "compv5": "BaseGemmPipelineAgBgCrCompV5", + "compv6": "BaseGemmPipelineAgBgCrCompV6", + "comp_async": "BaseGemmPipelineAgBgCrCompAsync", + "basic_async_v1": "BaseGemmPipelineAGmemBGmemCRegV1", + } + return pipelines.get(pipeline, "BaseGemmPipelineAgBgCrCompV3") + + def _kernel_instance_two_stage( + self, config: GroupedConvKernelConfig, kernel_name: str + ) -> str: + """Generate two-stage bwd_weight kernel: GEMM into fp32 workspace + ElementWise convert. + + Mirrors grouped_convolution_backward_weight_two_stage_invoker.hpp from + example/ck_tile/20_grouped_convolution/. 
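        Sketch of the launch sequence emitted by the template below:
          1. allocate an fp32 workspace of G_ * K_ * C_ * prod(filter_spatial_lengths_)
             elements (VectorSizeC is forced to 1 for the workspace writes);
          2. run the implicit-GEMM bwd_weight kernel into that workspace, zeroing it
             first via hipMemsetAsync when k_batch > 1;
          3. run an ElementWise UnaryConvert kernel that casts the workspace back to
             WeiDataType (fp16/bf16) in the user's output buffer.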
+ """ + tr = config.trait + ns_name = "ns_" + kernel_name.replace("-", "_") + direction_prefix = "BWD_WEIGHT" + launcher_alias = "SelectedConvBwdWeightLauncher" + + return f""" +namespace {ns_name} {{ + +using Config = {kernel_name}_Config; +constexpr const char* CONV_{direction_prefix}_KERNEL_NAME = "{kernel_name}"; +using SelectedConv{direction_prefix.title()}Kernel = Config; + +struct {kernel_name}_Launcher {{ + using KernelConfig = Config; + using InDataType = typename Config::InDataType; + using WeiDataType = typename Config::WeiDataType; + using OutDataType = typename Config::OutDataType; + using AccDataType = typename Config::AccDataType; + using InLayout = typename Config::InLayout; + using WeiLayout = typename Config::WeiLayout; + using OutLayout = typename Config::OutLayout; + using WorkspaceDataType = float; + + static constexpr index_t NDimSpatial = Config::NDimSpatial; + // Two-stage forces VectorSizeC = 1 for workspace writes + static constexpr index_t VectorSizeC_TwoStage = 1; + + using GemmShape = TileGemmShape< + sequence, + sequence, + sequence>; + + static constexpr auto ConvSpec = ConvolutionSpecialization::Default; + using GroupedConvTraitsType = GroupedConvTraits< + NDimSpatial, ConvSpec, InLayout, WeiLayout, tuple<>, OutLayout, + Config::VectorSizeA, Config::VectorSizeB, VectorSizeC_TwoStage, + Config::NumGroupsToMerge, Config::EnableSplitImage, Config::ExplicitGemm>; + + using TilePartitioner = GemmSpatiallyLocalTilePartitioner< + GemmShape, + GroupedConvTraitsType::FixedGemmParams::TilePartitionerGroupNum, + GroupedConvTraitsType::FixedGemmParams::TilePartitionerM01>; + + using GemmUniversalTraits = TileGemmUniversalTraits< + GroupedConvTraitsType::FixedGemmParams::kPadM, + GroupedConvTraitsType::FixedGemmParams::kPadN, + GroupedConvTraitsType::FixedGemmParams::kPadK, + Config::DoubleSmemBuffer, + typename GroupedConvTraitsType::AsLayoutBwdWeight, + typename GroupedConvTraitsType::BsLayoutBwdWeight, + typename GroupedConvTraitsType::CLayoutBwdWeight, + GroupedConvTraitsType::FixedGemmParams::TransposeC, + GroupedConvTraitsType::FixedGemmParams::UseStructuredSparsity, + GroupedConvTraitsType::FixedGemmParams::Persistent, + Config::NumWaveGroups>; + + using GemmPipelineProblem = GemmPipelineProblem< + OutDataType, InDataType, AccDataType, GemmShape, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdWeight, + element_wise::PassThrough, element_wise::PassThrough, WeiDataType, + GroupedConvTraitsType::FixedGemmParams::FixedVectorSize, + GroupedConvTraitsType::VectorSizeA, GroupedConvTraitsType::VectorSizeB>; + + using BaseGemmPipeline = {self._get_base_pipeline(tr.pipeline)}; + + static float launch(const GroupedConvBwdWeightHostArgs& args, const stream_config& s) {{ + const index_t gemm_k = args.N_ * std::accumulate( + args.output_spatial_lengths_.begin(), args.output_spatial_lengths_.end(), + 1, std::multiplies()); + + const index_t k_grain = args.k_batch * Config::K_Tile; + const index_t K_split = (gemm_k + k_grain - 1) / k_grain * Config::K_Tile; + const index_t num_loop = TilePartitioner::GetLoopNum(K_split); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + + float ave_time{{0}}; + + constexpr auto scheduler = Config::Scheduler; + + using UniversalGemmProblem = UniversalGemmPipelineProblem< + OutDataType, InDataType, AccDataType, GemmShape, GemmUniversalTraits, + scheduler, + element_wise::PassThrough, element_wise::PassThrough, 
WeiDataType, + GroupedConvTraitsType::FixedGemmParams::FixedVectorSize, + GroupedConvTraitsType::VectorSizeA, GroupedConvTraitsType::VectorSizeB>; + + using GemmPipeline = {self._get_pipeline_template_args(tr.pipeline, "UniversalGemmProblem")}; + + // Epilogue writes to fp32 workspace (not fp16 output) + using ConvEpilogue = CShuffleEpilogue, AccDataType, WorkspaceDataType, + typename GroupedConvTraitsType::ImplicitGemmDsLayout, + typename GroupedConvTraitsType::FixedGemmParams::ELayout, + element_wise::PassThrough, + TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, + Config::M_Warp, Config::N_Warp, Config::M_Warp_Tile, + Config::N_Warp_Tile, Config::K_Warp_Tile, + GroupedConvTraitsType::FixedGemmParams::TransposeC, + Config::NumWaveGroups, + GroupedConvTraitsType::FixedGemmParams::FixedVectorSize, + GroupedConvTraitsType::VectorSizeC>>; + + using Kernel = GroupedConvolutionBackwardWeightKernel< + GroupedConvTraitsType, TilePartitioner, GemmPipeline, ConvEpilogue>; + + // ElementWise kernel: fp32 workspace -> fp16/bf16 output + using XElementwiseOp = element_wise::UnaryConvert; + using EwBlockTile = sequence<2048>; + using EwBlockWarps = sequence<8>; + using EwWarpTile = sequence<64>; + using EwShape = ElementWiseShape; + using EwProblem = ElementWisePipelineProblem< + WorkspaceDataType, WorkspaceDataType, WeiDataType, EwShape, XElementwiseOp>; + using EwKernel = ElementWiseKernel; + + // Workspace: G * K * C * product(filter_spatial) elements in fp32 + const index_t spatial_accum = std::accumulate( + args.filter_spatial_lengths_.begin(), args.filter_spatial_lengths_.end(), + 1, std::multiplies()); + DeviceMem ws_buf(args.G_ * args.K_ * args.C_ * spatial_accum * sizeof(WorkspaceDataType)); + + GroupedConvBwdWeightHostArgs ws_args(args); + auto* c_ptr = ws_args.wei_ptr; + ws_args.wei_ptr = ws_buf.GetDeviceBuffer(); + + auto kargs = Kernel::MakeKernelArgs(ws_args); + + if(!Kernel::IsSupportedArgument(kargs)) {{ + throw std::runtime_error("Arguments not supported for two-stage bwd_weight kernel"); + }} + + const dim3 grids = Kernel::GridSize(kargs); + const dim3 blocks = Kernel::BlockSize(); + + // ElementWise kernel setup + const index_t ew_block_size = EwKernel::BlockSize(); + const index_t total_elems = args.G_ * args.K_ * args.C_ * spatial_accum; + constexpr index_t elems_per_block = EwBlockTile::at(number<0>{{}}); + const index_t ew_grid_size = (total_elems + elems_per_block - 1) / elems_per_block; + + auto ew_shape = make_tuple(args.G_ * args.K_, + args.C_ * spatial_accum); + auto ew_inputs = make_tuple(static_cast(ws_args.wei_ptr)); + + if(!EwKernel::IsSupportedArgument(ew_shape)) {{ + throw std::runtime_error("ElementWise arguments not supported for two-stage convert"); + }} + + auto preprocess = [&]() {{ + if(kargs.k_batch > 1) + hip_check_error(hipMemsetAsync( + ws_args.wei_ptr, 0, + total_elems * sizeof(WorkspaceDataType), + s.stream_id_)); + }}; + + ave_time = launch_kernel_time_mask( + s, preprocess, + make_kernel(Kernel{{}}, grids, blocks, 0, kargs), + make_kernel( + EwKernel{{}}, ew_grid_size, ew_block_size, 0, + ew_shape, + make_tuple(args.C_ * spatial_accum, 1), + make_tuple(args.C_ * spatial_accum, 1), + ew_inputs, + static_cast(c_ptr))); + + return ave_time; + }} +}}; + +using {launcher_alias} = {kernel_name}_Launcher; + +}} // namespace {ns_name} + +using {kernel_name}_Launcher = {ns_name}::{kernel_name}_Launcher; + +#ifdef CK_TILE_SINGLE_KERNEL_INCLUDE +using {launcher_alias} = {ns_name}::{launcher_alias}; +constexpr const char* CONV_{direction_prefix}_KERNEL_NAME 
= {ns_name}::CONV_{direction_prefix}_KERNEL_NAME; +#endif +""" + + +# ============================================================================ +# Dispatcher Wrapper Generator +# ============================================================================ + + +class GroupedConvDispatcherWrapperGenerator: + """Generates dispatcher integration wrapper following GEMM pattern""" + + # Static mappings for pipeline and scheduler enum names (matches kernel_key.hpp) + PIPELINE_TO_DISPATCHER = { + "mem": "Pipeline::Mem", + "compv3": "Pipeline::CompV3", + "compv4": "Pipeline::CompV4", + "compv5": "Pipeline::CompV5", + "preshufflev1": "Pipeline::PreShuffleV1", + "preshufflev2": "Pipeline::PreShuffleV2", + } + + SCHEDULER_TO_DISPATCHER = { + "default": "Scheduler::Default", + "intrawave": "Scheduler::Intrawave", + "interwave": "Scheduler::Interwave", + } + + def __init__( + self, + datatype: str, + variant: GroupedConvVariant = GroupedConvVariant.FORWARD, + ): + self.datatype = datatype + self.variant = variant + + def _pipeline_to_dispatcher(self, pipeline: str) -> str: + """Convert pipeline string to dispatcher enum value""" + return self.PIPELINE_TO_DISPATCHER.get( + pipeline.lower(), f"Pipeline::{pipeline.capitalize()}" + ) + + def _scheduler_to_dispatcher(self, scheduler: str) -> str: + """Convert scheduler string to dispatcher enum value""" + return self.SCHEDULER_TO_DISPATCHER.get( + scheduler.lower(), f"Scheduler::{scheduler.capitalize()}" + ) + + def generate( + self, + config: GroupedConvKernelConfig, + kernel_path: Path, + output_dir: Path, + ) -> str: + """Generate dispatcher wrapper with factory function for registry""" + kernel_name = config.name(self.datatype) + rel_path = kernel_path.relative_to(output_dir) + + # Determine launcher type based on variant + if self.variant == GroupedConvVariant.FORWARD: + launcher_alias = "SelectedConvKernelLauncher" + host_args_type = "GroupedConvFwdHostArgs<>" + conv_type_str = "forward" + elif self.variant == GroupedConvVariant.BACKWARD_DATA: + launcher_alias = "SelectedConvBwdDataLauncher" + host_args_type = "GroupedConvBwdDataHostArgs" + conv_type_str = "bwd_data" + else: # BACKWARD_WEIGHT + launcher_alias = "SelectedConvBwdWeightLauncher" + host_args_type = "GroupedConvBwdWeightHostArgs" + conv_type_str = "bwd_weight" + + return f"""// SPDX-License-Identifier: MIT +// Auto-generated dispatcher wrapper for: {kernel_name} +#pragma once + +#include "ck_tile/dispatcher.hpp" +#include "ck_tile/dispatcher/grouped_conv_utils.hpp" +#include "../{rel_path}" + +namespace ck_tile {{ +namespace dispatcher {{ +namespace generated {{ + +using ::ck_tile::dispatcher::GroupedConvKernelInstancePtr; +using ::ck_tile::dispatcher::GroupedConvKernelKey; +using ::ck_tile::dispatcher::DataType; +using ::ck_tile::dispatcher::LayoutTag; +using ::ck_tile::dispatcher::Pipeline; +using ::ck_tile::dispatcher::Scheduler; +using ::ck_tile::dispatcher::Epilogue; +using Priority = ::ck_tile::dispatcher::GroupedConvRegistry::Priority; + +// Factory function to create kernel instance for registry +inline GroupedConvKernelInstancePtr make_{kernel_name}(const std::string& gfx_arch = "gfx942") {{ + GroupedConvKernelKey key; + key.signature.dtype_in = DataType::FP16; + key.signature.dtype_wei = DataType::FP16; + key.signature.dtype_out = DataType::FP16; + key.signature.dtype_acc = DataType::FP32; + key.signature.layout = "nhwgc"; + key.signature.conv_type = "{conv_type_str}"; + key.signature.num_dims = {config.ndim_spatial}; + key.signature.groups = 1; + + key.algorithm.tile_shape = 
{{{config.tile.tile_m}, {config.tile.tile_n}, {config.tile.tile_k}}}; + key.algorithm.wave_shape = {{{config.tile.warp_m}, {config.tile.warp_n}, 1}}; + key.algorithm.warp_tile_shape = {{{config.tile.warp_tile_m}, {config.tile.warp_tile_n}, {config.tile.warp_tile_k}}}; + key.algorithm.pipeline = {self._pipeline_to_dispatcher(config.trait.pipeline)}; + key.algorithm.scheduler = {self._scheduler_to_dispatcher(config.trait.scheduler)}; + key.algorithm.epilogue = Epilogue::CShuffle; + key.gfx_arch = gfx_arch; + + // Create kernel instance that wraps the launcher + return std::make_shared( + key, + "{kernel_name}", + []({host_args_type}& args, const stream_config& cfg) -> float {{ + return {kernel_name}_Launcher::launch(args, cfg); + }} + ); +}} + +}} // namespace generated +}} // namespace dispatcher +}} // namespace ck_tile + +// Export launcher alias to global namespace for direct use +using {launcher_alias} = {kernel_name}_Launcher; +""" + + +# ============================================================================ +# Configuration Parser +# ============================================================================ + + +def get_default_configs( + arch: str = "gfx942", + variants: Optional[List[GroupedConvVariant]] = None, + ndims: Optional[List[int]] = None, +) -> List[GroupedConvKernelConfig]: + """Get default grouped convolution configurations for target architecture""" + configs = [] + + if variants is None: + variants = [GroupedConvVariant.FORWARD] + if ndims is None: + ndims = [2] + + # Valid configurations per variant (based on CK Tile example configs) + # Forward and Backward Data: standard GEMM-like tiles + fwd_bwd_data_tiles = [ + # (tile_m, tile_n, tile_k, warp_m, warp_n, warp_tile_m, warp_tile_n, warp_tile_k) + (128, 128, 32, 2, 2, 32, 32, 16), # Standard 128x128 + (256, 256, 32, 2, 2, 32, 32, 16), # Large 256x256 + (64, 64, 32, 1, 4, 16, 16, 16), # Small 64x64 + (128, 64, 32, 2, 2, 32, 32, 16), # Rectangular + (16, 64, 64, 1, 4, 16, 16, 32), # Tall and narrow + ] + + # Backward Weight: VERY specific tile configs that work with CK Tile's bwd_weight kernel + # Based on ConvConfigComputeV3 from CK Tile examples (example/ck_tile/20_grouped_convolution/) + # Note: Backward weight has strict constraints on warp configurations due to transpose_tile2d + # Only specific warp configs work: (1, 4, 1) and (4, 1, 1) are known to work + bwd_weight_tiles = [ + # (tile_m, tile_n, tile_k, warp_m, warp_n, warp_tile_m, warp_tile_n, warp_tile_k) + # ConvConfigComputeV3: The primary working config for backward weight + (16, 64, 64, 1, 4, 16, 16, 32), + ] + + for variant in variants: + # Select tile configs based on variant + if variant == GroupedConvVariant.BACKWARD_WEIGHT: + tile_configs = bwd_weight_tiles + # Backward weight ONLY supports compv3 (compv4/compv5 have transpose_tile2d issues) + pipelines = [("compv3", "cshuffle")] + # Also generate two-stage variants (fp32 workspace + elementwise convert) + two_stage_flags = [False, True] + elif variant == GroupedConvVariant.BACKWARD_DATA: + tile_configs = fwd_bwd_data_tiles + # Backward data ONLY supports compv3 (compv4 has get_length issues in bwd_data kernel) + pipelines = [("compv3", "cshuffle")] + two_stage_flags = [False] + else: + tile_configs = fwd_bwd_data_tiles + # Only forward grouped convolution supports both compv3 and compv4 + pipelines = [("compv3", "cshuffle"), ("compv4", "cshuffle")] + two_stage_flags = [False] + for ndim in ndims: + for pipeline, epilogue in pipelines: + for ( + tile_m, + tile_n, + tile_k, + warp_m, + warp_n, 
+ warp_tile_m, + warp_tile_n, + warp_tile_k, + ) in tile_configs: + for two_stage in two_stage_flags: + adj_tile_k = tile_k * 2 if pipeline == "compv4" else tile_k + + trait = GroupedConvTraitConfig( + pipeline=pipeline, + scheduler="intrawave", + epilogue=epilogue, + double_smem_buffer=(pipeline == "compv4"), + pad_m=True, + pad_n=True, + pad_k=True, + two_stage=two_stage, + ) + + if not trait.is_valid(): + continue + + config = GroupedConvKernelConfig( + tile=TileConfig( + tile_m=tile_m, + tile_n=tile_n, + tile_k=adj_tile_k, + warp_m=warp_m, + warp_n=warp_n, + warp_k=1, + warp_tile_m=warp_tile_m, + warp_tile_n=warp_tile_n, + warp_tile_k=warp_tile_k, + ), + trait=trait, + variant=variant, + ndim_spatial=ndim, + arch=arch, + ) + + if config.is_valid_for_arch(): + configs.append(config) + + return configs + + +def get_arch_filter(): + """Get arch filter if available""" + try: + from arch_filter import ArchFilter + + return ArchFilter + except ImportError: + return None + + +# ============================================================================ +# Main Generator +# ============================================================================ + + +class _GenItem: + """Item for parallel generation with progress logging.""" + + def __init__( + self, + idx: int, + total: int, + config: GroupedConvKernelConfig, + datatype: str, + variant: GroupedConvVariant, + ): + self.idx = idx + self.total = total + self.config = config + self.datatype = datatype + self.variant = variant + + def __str__(self) -> str: + return f"kernel {self.idx}/{self.total}: {self.config.name(self.datatype)}" + + +class UnifiedGroupedConvCodegen: + """Main grouped convolution code generator""" + + def __init__( + self, + output_dir: Path, + gpu_target: str = "gfx942", + datatype: str = "fp16", + ndim_spatial: int = 2, + enable_arch_filter: bool = True, + ): + self.output_dir = output_dir + self.output_dir.mkdir(parents=True, exist_ok=True) + + # Create wrapper directory for dispatcher integration + self.wrapper_dir = self.output_dir / "dispatcher_wrappers" + self.wrapper_dir.mkdir(parents=True, exist_ok=True) + + self.generated_files: List[Path] = [] + self.generated_wrappers: List[Path] = [] + self.gpu_target = gpu_target + self.datatype = datatype + self.ndim_spatial = ndim_spatial + + # Initialize architecture filter for GPU-specific validation + self.arch_filter = None + if enable_arch_filter and HAS_ARCH_FILTER: + try: + self.arch_filter = ArchFilter(gpu_target, strict_mode=False) + log.info(f"Architecture filter enabled for {gpu_target}") + except ValueError as e: + log.warning(f"Could not create arch filter: {e}") + + def _get_configs(self) -> List[GroupedConvKernelConfig]: + """Get configurations for this codegen's datatype and ndim_spatial.""" + return get_default_configs( + arch=self.gpu_target, + variants=[ + GroupedConvVariant.FORWARD, + GroupedConvVariant.BACKWARD_DATA, + GroupedConvVariant.BACKWARD_WEIGHT, + ], + ndims=[self.ndim_spatial], + ) + + def _get_operator_type( + self, variant: GroupedConvVariant + ) -> Optional["OperatorType"]: + """Map GroupedConvVariant to OperatorType for arch validation""" + if OperatorType is None: + return None + + variant_to_operator = { + GroupedConvVariant.FORWARD: OperatorType.CONV_FWD, + GroupedConvVariant.BACKWARD_DATA: OperatorType.CONV_BWD_DATA, + GroupedConvVariant.BACKWARD_WEIGHT: OperatorType.CONV_BWD_WEIGHT, + } + return variant_to_operator.get(variant, OperatorType.CONV_FWD) + + def is_config_valid( + self, config: GroupedConvKernelConfig, datatype: str = 
"fp16" + ) -> bool: + """Validate configuration against architecture constraints""" + if not self.arch_filter or not HAS_ARCH_FILTER: + return True + + operator = self._get_operator_type(config.variant) + + return self.arch_filter.is_kernel_valid( + datatype_a=datatype, + datatype_b=datatype, + datatype_c=datatype, + tile_m=config.tile.tile_m, + tile_n=config.tile.tile_n, + tile_k=config.tile.tile_k, + warp_m=config.tile.warp_m, + warp_n=config.tile.warp_n, + warp_k=1, # Grouped conv typically uses warp_k=1 + warp_tile_m=config.tile.warp_tile_m, + warp_tile_n=config.tile.warp_tile_n, + warp_tile_k=config.tile.warp_tile_k, + pipeline=config.trait.pipeline, + epilogue=config.trait.epilogue, + scheduler=config.trait.scheduler, + operator=operator, + ) + + def generate_kernel( + self, + config: GroupedConvKernelConfig, + datatype: str, + variant: GroupedConvVariant = GroupedConvVariant.FORWARD, + ) -> Tuple[Path, Path]: + """Generate a single kernel file and dispatcher wrapper. Returns (kernel_path, wrapper_path).""" + kernel_gen = CKTileGroupedConvKernelGenerator(datatype, variant) + wrapper_gen = GroupedConvDispatcherWrapperGenerator(datatype, variant) + + kernel_name = config.name(datatype) + filename = f"{kernel_name}.hpp" + filepath = self.output_dir / filename + + # Generate kernel header + content = kernel_gen.generate(config) + filepath.write_text(content) + self.generated_files.append(filepath) + + # Generate dispatcher wrapper + wrapper_content = wrapper_gen.generate(config, filepath, self.output_dir) + wrapper_path = self.wrapper_dir / f"dispatcher_wrapper_{kernel_name}.hpp" + wrapper_path.write_text(wrapper_content) + self.generated_wrappers.append(wrapper_path) + + # Generate .cpp compilation unit for per-kernel parallel builds + cpp_filename = f"{kernel_name}.cpp" + cpp_filepath = self.output_dir / cpp_filename + cpp_content = f"""// SPDX-License-Identifier: MIT +// Auto-generated compilation unit for: {kernel_name} +// Enables per-kernel parallel compilation with make -j + +#include "{filename}" + +namespace ck_tile {{ namespace generated {{ + volatile bool _{kernel_name.replace("-", "_")}_loaded = true; +}} }} +""" + cpp_filepath.write_text(cpp_content) + + return filepath, wrapper_path + + def _generate_single_kernel(self, item: _GenItem): + """Generate one kernel (used by parallel_generate). Returns (kernel_path, wrapper_path) or raises.""" + kernel_path, wrapper_path = self.generate_kernel( + item.config, item.datatype, item.variant + ) + log.info( + "Generated kernel %d/%d: %s", + item.idx, + item.total, + item.config.name(item.datatype), + ) + return (kernel_path, wrapper_path) + + def generate_all( + self, + configs: Optional[List[GroupedConvKernelConfig]] = None, + datatypes: Optional[List[str]] = None, + parallel: bool = True, + ) -> dict: + """Generate all kernel files (optionally in parallel). + + Configs are filtered using architecture validation before generation. + Returns dict with keys: kernels, wrappers, failed. 
+ """ + if configs is None: + configs = self._get_configs() + if datatypes is None: + datatypes = [self.datatype] + + results = {"kernels": [], "wrappers": [], "failed": []} + + # Filter configs using arch validation + valid_tasks = [] + rejected_count = 0 + + for datatype in datatypes: + for config in configs: + if self.is_config_valid(config, datatype): + valid_tasks.append((config, datatype, config.variant)) + else: + rejected_count += 1 + log.debug( + f"Rejected config for {self.gpu_target}: " + f"{config.tile.tile_m}x{config.tile.tile_n}x{config.tile.tile_k} " + f"variant={config.variant.value}" + ) + + if rejected_count > 0: + log.info( + f"Filtered {rejected_count} configs for {self.gpu_target}, " + f"{len(valid_tasks)} remaining" + ) + + total = len(valid_tasks) + items = [ + _GenItem(i, total, config, datatype, variant) + for i, (config, datatype, variant) in enumerate(valid_tasks) + ] + + def _safe_generate(item: _GenItem): + """Wrapper that catches exceptions for failure tracking.""" + try: + k, w = self._generate_single_kernel(item) + return ("ok", k, w, None) + except Exception as e: + return ("fail", None, None, str(e)) + + raw = parallel_generate( + _safe_generate, items, parallel=parallel and len(items) > 1 + ) + for r in raw: + if r[0] == "ok": + results["kernels"].append(r[1]) + results["wrappers"].append(r[2]) + else: + results["failed"].append(r[3]) + log.error("Failed: %s", r[3]) + + # Generate include_all_*.hpp headers for Python ctypes libraries + if results["wrappers"]: + self._generate_include_all_headers() + + return results + + def _generate_include_all_headers(self): + """Generate include_all_grouped_conv_*.hpp headers and registration header""" + # Scan output directory for ALL kernel files (not just this run's generated_files) + # This handles the case where fwd and bwd kernels are generated in separate make targets + fwd_headers = [] + bwd_data_headers = [] + bwd_weight_headers = [] + fwd_kernels = [] + bwd_data_kernels = [] + bwd_weight_kernels = [] + + for filepath in self.output_dir.glob("grouped_conv_*.hpp"): + name = filepath.name + kernel_name = name[:-4] + if name.startswith("grouped_conv_fwd_"): + fwd_headers.append(name) + fwd_kernels.append(kernel_name) + elif name.startswith(("grouped_conv_bwd_data_", "grouped_conv_bwdd_")): + bwd_data_headers.append(name) + bwd_data_kernels.append(kernel_name) + elif name.startswith(("grouped_conv_bwd_weight_", "grouped_conv_bwdw_")): + bwd_weight_headers.append(name) + bwd_weight_kernels.append(kernel_name) + + headers_to_generate = [ + ("include_all_grouped_conv_fwd_kernels.hpp", fwd_headers, "forward"), + ( + "include_all_grouped_conv_bwd_data_kernels.hpp", + bwd_data_headers, + "backward data", + ), + ( + "include_all_grouped_conv_bwd_weight_kernels.hpp", + bwd_weight_headers, + "backward weight", + ), + ] + + for header_name, kernel_headers, variant_desc in headers_to_generate: + header_path = self.output_dir / header_name + includes = "\n".join(f'#include "{h}"' for h in sorted(kernel_headers)) + + # Pick the first kernel as the default Selected*Launcher + if kernel_headers: + first_kernel = sorted(kernel_headers)[0][:-4] # Remove .hpp + if variant_desc == "forward": + launcher_alias = ( + f"using SelectedConvKernelLauncher = {first_kernel}_Launcher;" + ) + elif variant_desc == "backward data": + launcher_alias = ( + f"using SelectedConvBwdDataLauncher = {first_kernel}_Launcher;" + ) + else: # backward weight + launcher_alias = f"using SelectedConvBwdWeightLauncher = {first_kernel}_Launcher;" + else: + 
launcher_alias = "// No kernels generated for this variant" + + content = f"""// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +// Auto-generated header for grouped conv {variant_desc} kernels +#pragma once + +{includes} + +// Default launcher alias (uses first kernel) +{launcher_alias} +""" + header_path.write_text(content) + if kernel_headers: + log.info(f"Generated: {header_name} ({len(kernel_headers)} kernels)") + + # Generate registration header (following GEMM pattern) + self._generate_registration_header( + fwd_kernels, bwd_data_kernels, bwd_weight_kernels + ) + + def _generate_registration_header( + self, + fwd_kernels: List[str], + bwd_data_kernels: List[str], + bwd_weight_kernels: List[str], + ): + """Generate master registration header for all grouped conv kernels""" + # Scan wrapper directory for ALL wrapper files + all_wrappers = [] + for wrapper_path in self.wrapper_dir.glob( + "dispatcher_wrapper_grouped_conv_*.hpp" + ): + all_wrappers.append(wrapper_path.name) + + wrapper_includes = "\n".join(f'#include "{w}"' for w in sorted(all_wrappers)) + + # Generate registration calls + fwd_registrations = "\n ".join( + f"registry.register_kernel(generated::make_{k}(gfx_arch), priority);" + for k in sorted(fwd_kernels) + ) + bwd_data_registrations = "\n ".join( + f"registry.register_kernel(generated::make_{k}(gfx_arch), priority);" + for k in sorted(bwd_data_kernels) + ) + bwd_weight_registrations = "\n ".join( + f"registry.register_kernel(generated::make_{k}(gfx_arch), priority);" + for k in sorted(bwd_weight_kernels) + ) + + content = f"""// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +// Auto-generated master registration header for grouped conv kernels +#pragma once + +#include "ck_tile/dispatcher.hpp" +#include "ck_tile/dispatcher/grouped_conv_utils.hpp" + +{wrapper_includes} + +namespace ck_tile {{ +namespace dispatcher {{ + +using Priority = GroupedConvRegistry::Priority; + +inline void register_all_grouped_conv_fwd_kernels( + const std::string& gfx_arch = "gfx942", + Priority priority = Priority::Normal) +{{ + auto& registry = GroupedConvRegistry::instance(); + {fwd_registrations if fwd_registrations else "// No forward kernels"} +}} + +inline void register_all_grouped_conv_bwd_data_kernels( + const std::string& gfx_arch = "gfx942", + Priority priority = Priority::Normal) +{{ + auto& registry = GroupedConvRegistry::instance(); + {bwd_data_registrations if bwd_data_registrations else "// No backward data kernels"} +}} + +inline void register_all_grouped_conv_bwd_weight_kernels( + const std::string& gfx_arch = "gfx942", + Priority priority = Priority::Normal) +{{ + auto& registry = GroupedConvRegistry::instance(); + {bwd_weight_registrations if bwd_weight_registrations else "// No backward weight kernels"} +}} + +inline void register_all_grouped_conv_kernels( + const std::string& gfx_arch = "gfx942", + Priority priority = Priority::Normal) +{{ + register_all_grouped_conv_fwd_kernels(gfx_arch, priority); + register_all_grouped_conv_bwd_data_kernels(gfx_arch, priority); + register_all_grouped_conv_bwd_weight_kernels(gfx_arch, priority); +}} + +inline std::size_t get_grouped_conv_fwd_kernel_count() {{ return {len(fwd_kernels)}; }} +inline std::size_t get_grouped_conv_bwd_data_kernel_count() {{ return {len(bwd_data_kernels)}; }} +inline std::size_t get_grouped_conv_bwd_weight_kernel_count() {{ return {len(bwd_weight_kernels)}; }} +inline std::size_t 
get_grouped_conv_kernel_count() {{ return {len(fwd_kernels) + len(bwd_data_kernels) + len(bwd_weight_kernels)}; }} + +}} // namespace dispatcher +}} // namespace ck_tile +""" + reg_path = self.wrapper_dir / "register_all_grouped_conv_kernels.hpp" + reg_path.write_text(content) + log.info(f"Generated registration header: {reg_path}") + + +# ============================================================================ +# CLI +# ============================================================================ + + +def main(): + parser = argparse.ArgumentParser( + description="Unified Grouped Convolution Code Generator" + ) + parser.add_argument( + "--output", + "-o", + type=Path, + default=Path("build/generated_kernels"), + help="Output directory", + ) + parser.add_argument( + "--datatype", + "-d", + type=str, + nargs="+", + default=["fp16"], + choices=["fp16", "bf16", "fp32"], + help="Data types to generate", + ) + parser.add_argument( + "--variant", + "-v", + type=str, + nargs="+", + default=["forward"], + choices=["forward", "bwd_data", "bwd_weight"], + help="Grouped convolution variants", + ) + parser.add_argument( + "--ndim", + "-n", + type=int, + nargs="+", + default=[2], + choices=[1, 2, 3], + help="Spatial dimensions", + ) + parser.add_argument( + "--arch", + "-a", + type=str, + default="gfx942", + choices=["gfx90a", "gfx942", "gfx950", "gfx1201"], + help="Target GPU architecture", + ) + parser.add_argument("--verbose", action="store_true", help="Verbose output") + parser.add_argument( + "--list-configs", + action="store_true", + help="List configurations without generating", + ) + + # Individual kernel configuration (when not using predefined configs) + parser.add_argument("--tile-m", type=int, help="Block tile M dimension") + parser.add_argument("--tile-n", type=int, help="Block tile N dimension") + parser.add_argument("--tile-k", type=int, help="Block tile K dimension") + parser.add_argument("--warp-m", type=int, help="Wave distribution M") + parser.add_argument("--warp-n", type=int, help="Wave distribution N") + parser.add_argument("--warp-k", type=int, default=1, help="Wave distribution K") + parser.add_argument("--warp-tile-m", type=int, help="Warp tile M") + parser.add_argument("--warp-tile-n", type=int, help="Warp tile N") + parser.add_argument("--warp-tile-k", type=int, default=16, help="Warp tile K") + parser.add_argument( + "--pipeline", + type=str, + choices=["mem", "compv3", "compv4", "compv5"], + help="Pipeline type", + ) + parser.add_argument( + "--scheduler", + type=str, + choices=["intrawave", "interwave"], + help="Scheduler type", + ) + parser.add_argument( + "--epilogue", + type=str, + default="cshuffle", + choices=["cshuffle", "default"], + help="Epilogue type", + ) + parser.add_argument("--pad-m", type=bool, default=True, help="Pad M dimension") + parser.add_argument("--pad-n", type=bool, default=True, help="Pad N dimension") + parser.add_argument("--pad-k", type=bool, default=True, help="Pad K dimension") + parser.add_argument("--vector-a", type=int, default=4, help="Vector size A") + parser.add_argument("--vector-b", type=int, default=8, help="Vector size B") + parser.add_argument("--vector-c", type=int, default=8, help="Vector size C") + parser.add_argument("--block-per-cu", type=int, default=1, help="Blocks per CU") + parser.add_argument("--num-wave-groups", type=int, default=1, help="Wave groups") + parser.add_argument( + "--num-groups-to-merge", type=int, default=1, help="Groups to merge" + ) + parser.add_argument( + "--double-smem-buffer", + type=str, + 
default=None, + help="Double SMEM buffer (true/false)", + ) + + args = parser.parse_args() + + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + # Map variant strings to enums + variant_map = { + "forward": GroupedConvVariant.FORWARD, + "bwd_data": GroupedConvVariant.BACKWARD_DATA, + "bwd_weight": GroupedConvVariant.BACKWARD_WEIGHT, + } + requested_variants = [variant_map[v] for v in args.variant] + + # Check if user specified custom configuration + custom_config = ( + args.tile_m is not None or args.tile_n is not None or args.pipeline is not None + ) + + if custom_config: + # Build custom config from CLI arguments + tile = TileConfig( + tile_m=args.tile_m or 128, + tile_n=args.tile_n or 128, + tile_k=args.tile_k or 64, + warp_m=args.warp_m or 2, + warp_n=args.warp_n or 2, + warp_k=args.warp_k or 1, + warp_tile_m=args.warp_tile_m or 32, + warp_tile_n=args.warp_tile_n or 32, + warp_tile_k=args.warp_tile_k or 16, + ) + pipeline = args.pipeline or "compv4" + # Determine double_smem_buffer: use CLI arg if given, else default based on pipeline + if args.double_smem_buffer is not None: + dsb = args.double_smem_buffer.lower() == "true" + else: + dsb = pipeline == "compv4" # compv4 requires double buffer + + trait = GroupedConvTraitConfig( + pipeline=pipeline, + scheduler=args.scheduler or "intrawave", + epilogue=args.epilogue or "cshuffle", + pad_m=args.pad_m, + pad_n=args.pad_n, + pad_k=args.pad_k, + double_smem_buffer=dsb, + num_groups_to_merge=args.num_groups_to_merge, + ) + config = GroupedConvKernelConfig( + tile=tile, + trait=trait, + variant=requested_variants[0] + if requested_variants + else GroupedConvVariant.FORWARD, + ndim_spatial=args.ndim[0] if args.ndim else 2, + arch=args.arch, + vector_size_a=args.vector_a, + vector_size_b=args.vector_b, + vector_size_c=args.vector_c, + block_per_cu=args.block_per_cu, + num_wave_groups=args.num_wave_groups, + ) + filtered_configs = [config] + else: + # Get predefined configurations for target arch with requested variants and ndims + filtered_configs = get_default_configs( + arch=args.arch, variants=requested_variants, ndims=args.ndim + ) + + if args.list_configs: + print(f"Grouped convolution configurations for {args.arch}:") + print(f" Datatypes: {args.datatype}") + print(f" Variants: {args.variant}") + print(f" Spatial dims: {args.ndim}") + print(f"\nConfigurations ({len(filtered_configs)}):") + for cfg in filtered_configs: + print(f" - {cfg.name('fp16')}") + print(f" Tile: {cfg.tile.tile_m}x{cfg.tile.tile_n}x{cfg.tile.tile_k}") + print(f" Warp: {cfg.tile.warp_m}x{cfg.tile.warp_n}x{cfg.tile.warp_k}") + print( + f" WarpTile: {cfg.tile.warp_tile_m}x{cfg.tile.warp_tile_n}x{cfg.tile.warp_tile_k}" + ) + print( + f" Pipeline: {cfg.trait.pipeline}, Epilogue: {cfg.trait.epilogue}, Scheduler: {cfg.trait.scheduler}" + ) + print( + f" Padding: M={cfg.trait.pad_m}, N={cfg.trait.pad_n}, K={cfg.trait.pad_k}" + ) + return + + # Generate + codegen = UnifiedGroupedConvCodegen( + output_dir=args.output, + gpu_target=args.arch, + enable_arch_filter=True, + ) + results = codegen.generate_all( + configs=filtered_configs, datatypes=args.datatype, parallel=True + ) + + print( + f"\nGenerated {len(results['kernels'])} grouped convolution kernel files " + f"for {args.arch} in {args.output}" + ) + if results["failed"]: + print(f" Failed: {len(results['failed'])}") + for err in results["failed"][:5]: + print(f" - {err}") + + +if __name__ == "__main__": + main() diff --git a/dispatcher/examples/CMakeLists.txt b/dispatcher/examples/CMakeLists.txt index 
0359eb0d8d..ab094e90cf 100644 --- a/dispatcher/examples/CMakeLists.txt +++ b/dispatcher/examples/CMakeLists.txt @@ -187,7 +187,6 @@ function(add_gpu_example NAME SOURCE KERNEL_HEADER) if(HEADER_NAME STREQUAL "register_all_kernels.hpp") # Registration header - examples include it directly target_compile_options(${NAME} PRIVATE - -DGEMM_KERNEL_AVAILABLE=1 -mllvm -enable-noalias-to-md-conversion=0 -Wno-undefined-func-template -Wno-float-equal @@ -315,6 +314,7 @@ function(add_declarative_gpu_example NAME SOURCE) target_include_directories(${NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include ${CMAKE_CURRENT_SOURCE_DIR}/../include + ${CMAKE_CURRENT_SOURCE_DIR}/../.. ${EXAMPLE_KERNEL_DIR} ${EXAMPLE_KERNEL_DIR}/dispatcher_wrappers ) @@ -322,7 +322,6 @@ function(add_declarative_gpu_example NAME SOURCE) # Force-include the generated registration header target_compile_options(${NAME} PRIVATE -include ${EXAMPLE_HEADER} - -DGEMM_KERNEL_AVAILABLE=1 -mllvm -enable-noalias-to-md-conversion=0 -Wno-undefined-func-template -Wno-float-equal @@ -345,6 +344,56 @@ add_declarative_gpu_example(gemm_03_benchmark_validation gemm/cpp/03_benchmark_v add_declarative_gpu_example(gemm_04_heuristics gemm/cpp/04_heuristics.cpp) add_declarative_gpu_example(gemm_05_json_export gemm/cpp/05_json_export.cpp) add_declarative_gpu_example(gemm_06_multi_registry gemm/cpp/06_multi_registry.cpp) +add_declarative_gpu_example(gemm_07_gfx950_minimal gemm/cpp/07_gfx950_minimal.cpp) + +# ML Heuristic example -- requires LightGBM shared library +# Derive site-packages from active Python interpreter (respects virtualenvs) +find_package(Python3 COMPONENTS Interpreter) + +set(LIGHTGBM_SEARCH_PATHS) +if(Python3_FOUND AND Python3_EXECUTABLE) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import sysconfig; print(sysconfig.get_path('purelib'))" + OUTPUT_VARIABLE PYTHON_SITE_PACKAGES + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + if(PYTHON_SITE_PACKAGES) + list(APPEND LIGHTGBM_SEARCH_PATHS "${PYTHON_SITE_PACKAGES}/lightgbm/lib") + endif() +endif() + +# Fallback to common Python 3.x site-packages if auto-detection failed +if(NOT PYTHON_SITE_PACKAGES) + list(APPEND LIGHTGBM_SEARCH_PATHS + "$ENV{HOME}/.local/lib/python3.12/site-packages/lightgbm/lib" + ) +endif() + +find_library(LIGHTGBM_LIB NAMES LightGBM lib_lightgbm _lightgbm + HINTS ${CMAKE_PREFIX_PATH} + PATHS ${LIGHTGBM_SEARCH_PATHS} + NO_DEFAULT_PATH + DOC "LightGBM shared library for ML heuristics" +) + +# Fallback: search default paths (respects LightGBM_DIR if set by user) +if(NOT LIGHTGBM_LIB) + find_library(LIGHTGBM_LIB NAMES LightGBM lib_lightgbm) +endif() + +if(LIGHTGBM_LIB) + add_declarative_gpu_example(gemm_09_ml_heuristic gemm/cpp/09_ml_heuristic.cpp) + target_link_libraries(gemm_09_ml_heuristic PRIVATE ${LIGHTGBM_LIB}) + message(STATUS "LightGBM found: ${LIGHTGBM_LIB} -- building gemm_09_ml_heuristic") +else() + message(STATUS "LightGBM not found -- skipping gemm_09_ml_heuristic") + message(STATUS " To enable ML heuristic example:") + message(STATUS " 1. Activate virtualenv: source .venv/bin/activate") + message(STATUS " 2. Install: pip install -r ../requirements-ml.txt") + message(STATUS " 3. 
Reconfigure: cmake ..") + message(STATUS " Or set CMAKE_PREFIX_PATH or LightGBM_DIR to LightGBM location") +endif() # ============================================================================= # GEMM Python Library - Single Fallback Kernel @@ -394,19 +443,79 @@ if(hip_FOUND) endif() add_dependencies(dispatcher_gemm_lib generate_gemm_fallback_kernel) +# ============================================================================= +# Grouped Convolution C++ Examples +# ============================================================================= + +add_declarative_gpu_example(grouped_conv_01_basic grouped_conv/cpp/01_basic_grouped_conv.cpp) +add_declarative_gpu_example(grouped_conv_02_all_dirs grouped_conv/cpp/02_all_directions.cpp) +add_declarative_gpu_example(grouped_conv_03_bench_val grouped_conv/cpp/03_benchmark_validation.cpp) +add_declarative_gpu_example(grouped_conv_04_registry_json grouped_conv/cpp/04_registry_json.cpp) +add_declarative_gpu_example(grouped_conv_05_bwd_data grouped_conv/cpp/05_bwd_data.cpp) +add_declarative_gpu_example(grouped_conv_06_bwd_weight grouped_conv/cpp/06_bwd_weight.cpp) +add_declarative_gpu_example(grouped_conv_07_benchmark grouped_conv/cpp/07_multi_tile_benchmark.cpp) + +# ============================================================================= +# Grouped Convolution Python Library - Multi-Kernel (fwd/bwd_data/bwd_weight x 2D/3D) +# ============================================================================= + +# Kernel output directory for the Python conv library +set(CONV_FALLBACK_KERNEL_DIR "${CMAKE_CURRENT_BINARY_DIR}/conv_python_fallback") +set(CONV_DISPATCH_HEADER "${CONV_FALLBACK_KERNEL_DIR}/conv_python_dispatch.hpp") + +# Generate ALL conv kernels (fwd/bwd_data/bwd_weight x 2D/3D x multiple tile configs) +# then create the dispatch header with 2D/3D aliases +add_custom_command( + OUTPUT ${CONV_DISPATCH_HEADER} + COMMAND ${CMAKE_COMMAND} -E make_directory ${CONV_FALLBACK_KERNEL_DIR} + COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../codegen/unified_grouped_conv_codegen.py + --variant forward bwd_data bwd_weight --ndim 2 3 + --datatype fp16 --arch ${GPU_TARGET} + --output ${CONV_FALLBACK_KERNEL_DIR} + COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../scripts/generate_conv_dispatch_header.py + --kernel-dir ${CONV_FALLBACK_KERNEL_DIR} + --output ${CONV_DISPATCH_HEADER} + COMMENT "Generating conv kernels (fwd/bwd_data/bwd_weight x 2D/3D) for Python library..." 
+ VERBATIM +) + +add_custom_target(generate_conv_fallback_kernels DEPENDS ${CONV_DISPATCH_HEADER}) + +# Conv dynamic library for Python (all 6 kernel variants) +add_library(dispatcher_conv_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/../bindings/ctypes/conv_ctypes_lib.cpp) +target_link_libraries(dispatcher_conv_lib PRIVATE ck_tile_dispatcher) +target_include_directories(dispatcher_conv_lib PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../include + ${CMAKE_CURRENT_SOURCE_DIR}/../include + ${CONV_FALLBACK_KERNEL_DIR} +) +target_compile_options(dispatcher_conv_lib PRIVATE + -include ${CONV_DISPATCH_HEADER} + -DGFX_ARCH="${GPU_TARGET}" + -mllvm -enable-noalias-to-md-conversion=0 + -Wno-undefined-func-template + -Wno-float-equal + --offload-compress +) +if(hip_FOUND) + target_link_libraries(dispatcher_conv_lib PRIVATE hip::device hip::host) +endif() +add_dependencies(dispatcher_conv_lib generate_conv_fallback_kernels) + message(STATUS "GEMM examples configured - kernels will be generated during 'make'") +message(STATUS "Grouped Conv examples configured - kernels will be generated during 'make'") # Convenience target to build all Python ctypes libraries add_custom_target(python_libs - DEPENDS dispatcher_gemm_lib - COMMENT "Building Python ctypes libraries (GEMM)" + DEPENDS dispatcher_gemm_lib dispatcher_conv_lib + COMMENT "Building Python ctypes libraries (GEMM + Conv)" ) # ============================================================================= # Per-Architecture Kernel Generation Targets # ============================================================================= -set(SUPPORTED_GPU_ARCHS gfx942 gfx90a gfx1100 gfx1030) +set(SUPPORTED_GPU_ARCHS gfx942 gfx950 gfx90a gfx1100 gfx1030) foreach(ARCH ${SUPPORTED_GPU_ARCHS}) # GEMM kernels for this arch diff --git a/dispatcher/examples/README.md b/dispatcher/examples/README.md index fdee9c3583..24bea821ba 100644 --- a/dispatcher/examples/README.md +++ b/dispatcher/examples/README.md @@ -1,8 +1,6 @@ # CK Tile Dispatcher Examples -Comprehensive examples for GEMM operations with GPU execution. - -> **Note**: Convolution examples have been moved to `ck-2/conv_archive/` for reference. +Comprehensive examples for GEMM and Grouped Convolution operations with GPU execution. --- @@ -60,11 +58,11 @@ python3 examples/gemm/python/08_heuristics.py ``` examples/ -├── gemm/ -│ ├── cpp/ # 6 C++ GEMM examples -│ └── python/ # 11 Python GEMM examples -│ -└── README.md +|---- gemm/ +| |---- cpp/ # 6 C++ GEMM examples +| +---- python/ # 11 Python GEMM examples +| ++---- README.md ``` --- @@ -201,10 +199,31 @@ rocminfo | grep "Name:" --- -## Archived Examples +## Grouped Convolution -Convolution examples have been archived to `ck-2/conv_archive/dispatcher/`: -- `examples/conv/cpp/` - 11 C++ convolution examples -- `examples/conv/python/` - 14 Python convolution examples +Grouped convolution support has been re-introduced with a unified infrastructure shared with GEMM. -See the archive for convolution functionality reference. 
+### Infrastructure + +The grouped convolution code generation, utilities, and build scripts are available: + +| Component | Location | +|-----------|----------| +| C++ Headers | `include/ck_tile/dispatcher/grouped_conv_*.hpp` | +| Python Codegen | `codegen/unified_grouped_conv_codegen.py` | +| Python Utils | `python/grouped_conv_utils.py` | +| Build Script | `scripts/compile_grouped_conv_examples.py` | + +### Building Grouped Conv Kernels + +```bash +# Generate grouped conv kernels +python3 codegen/unified_grouped_conv_codegen.py \ + --output-dir build/generated_kernels \ + --datatype fp16 --variant forward --ndim-spatial 2 + +# Compile a grouped conv example +python3 scripts/compile_grouped_conv_examples.py my_grouped_conv_example.cpp +``` + +See the [main README](../README.md#grouped-convolution-support) for more details. diff --git a/dispatcher/examples/gemm/cpp/02_multi_size.cpp b/dispatcher/examples/gemm/cpp/02_multi_size.cpp index 5e620209f4..ffd2858be4 100644 --- a/dispatcher/examples/gemm/cpp/02_multi_size.cpp +++ b/dispatcher/examples/gemm/cpp/02_multi_size.cpp @@ -21,9 +21,9 @@ * - pipeline: "compv3" -> 1 option (compv4 requires special handling) * - scheduler: "intrawave" -> 1 option * - * Raw expansion: 3 × 2 = 6 configs, but arch filter validates each: - * - tile_m must be divisible by (warp_m × warp_tile_m) - * - tile_n must be divisible by (warp_n × warp_tile_n) + * Raw expansion: 3 x 2 = 6 configs, but arch filter validates each: + * - tile_m must be divisible by (warp_m x warp_tile_m) + * - tile_n must be divisible by (warp_n x warp_tile_n) * - Some wave/warp combos invalid: (4,1,1)+(32,32,16), (1,4,1)+(32,32,16) * Result: 4 valid wildcard kernels + 1 explicit = 5 total * @@ -70,13 +70,13 @@ DECL_KERNEL_SET(multi_size_kernels, .add(Signature().dtype("fp16").layout("rcr"), Algorithm() .tile(64, 64, 64) - .wave(ANY_INT, ANY_INT, 1) // ANY_INT → (1,4,1), (2,2,1), (4,1,1) - .warp(-1, -1, -1) // -1 same as ANY_INT → (16,16,32), (32,32,16) - .pipeline("*") // "*" → valid pipelines - .scheduler("*") // "*" → valid schedulers + .wave(ANY_INT, ANY_INT, 1) // ANY_INT -> (1,4,1), (2,2,1), (4,1,1) + .warp(-1, -1, -1) // -1 same as ANY_INT -> (16,16,32), (32,32,16) + .pipeline("*") // "*" -> valid pipelines + .scheduler("*") // "*" -> valid schedulers .epilogue("cshuffle"), "gfx942")); -// Raw: 3×2=6, arch filter removes 2 invalid → 4 valid kernels +// Raw: 3x2=6, arch filter removes 2 invalid -> 4 valid kernels // ============================================================================= // MAIN @@ -116,8 +116,8 @@ int main(int argc, char* argv[]) .pipeline("*") -> expands to valid pipelines = 1 .scheduler("*") -> expands to valid schedulers = 1 - Expanded: 3 × 2 = 6 configs, but arch filter validates each: - - wave×warp must divide tile: (4,1,1)×(32,32,16) invalid for 64x64 + Expanded: 3 x 2 = 6 configs, but arch filter validates each: + - wave x warp must divide tile: (4,1,1)x(32,32,16) invalid for 64x64 - Result: 4 valid kernels from wildcard + 1 explicit = 5 total )"; diff --git a/dispatcher/examples/gemm/cpp/07_gfx950_minimal.cpp b/dispatcher/examples/gemm/cpp/07_gfx950_minimal.cpp new file mode 100644 index 0000000000..7e62ad2e4f --- /dev/null +++ b/dispatcher/examples/gemm/cpp/07_gfx950_minimal.cpp @@ -0,0 +1,191 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +/** + * Example 07: Minimal gfx950 (CDNA4 / MI350) GEMM + * + * Demonstrates the dispatcher working with gfx950-specific kernels: + * + * - fp16 GEMM with standard tile configs + * - fp8 GEMM with gfx950-extended warp tiles (16x16x128) + * - 160KB LDS: gfx950 raises the LDS from 64KB to 160KB (2.5x) + * + * Build: cd dispatcher/build && cmake .. -DGPU_TARGETS=gfx950 && make gemm_07_gfx950_minimal + */ + +#include <algorithm> +#include <cmath> +#include <iomanip> +#include <iostream> +#include <string> +#include <vector> + +#include "ck_tile/dispatcher.hpp" +#include "ck_tile/dispatcher/kernel_decl.hpp" +#include "ck_tile/dispatcher/example_args.hpp" + +using namespace ck_tile::dispatcher; +using namespace ck_tile::dispatcher::backends; +using namespace ck_tile::dispatcher::utils; +using Signature = decl::Signature; +using Algorithm = decl::Algorithm; + +// ============================================================================= +// gfx950-targeted kernel declarations +// ============================================================================= + +DECL_KERNEL_SET(gfx950_gemm_kernels, + + // fp16 128x128x32 -- bread-and-butter config, works on all CDNA + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(128, 128, 32) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx950") + + // fp16 128x128x64 -- deeper K tile using more LDS + // LDS usage: 128*64*2 + 128*64*2 = 32768 bytes (fits 64KB, gfx950 has 160KB) + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(128, 128, 64) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx950") + + // fp16 64x64x32 -- small-tile variant for small problems + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(64, 64, 32) + .wave(2, 2, 1) + .warp(16, 16, 32) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx950")); + +// ============================================================================= +// MAIN +// ============================================================================= + +int main(int argc, char* argv[]) +{ + ExampleArgs args("Example 07: gfx950 Minimal GEMM", + "Demonstrates gfx950 (CDNA4 / MI350) dispatcher"); + args.add_flag("--list", "List registered kernels"); + args.add_flag("--list-verbose", "List registered kernels with full details"); + args.add_option("--M", "1024", "Problem M dimension"); + args.add_option("--N", "1024", "Problem N dimension"); + args.add_option("--K", "1024", "Problem K dimension"); + args.add_option("--arch", "gfx950", "GPU architecture (default: gfx950)"); + + if(!args.parse(argc, argv)) + return 0; + + std::string gfx_arch = args.get("--arch", "gfx950"); + + print_header("Example 07: gfx950 (CDNA4) Minimal GEMM"); + + // ========================================================================= + // Architecture info + // ========================================================================= + std::cout << "\ngfx950 (CDNA4 / MI350) highlights:\n"; + std::cout << " - 160KB LDS (up from 64KB on gfx942)\n"; + std::cout << " - Extended FP8 warp tiles: 16x16x128, 32x32x64\n"; + std::cout << " - Packed FP4 support (pk_fp4)\n"; + std::cout << " - Same warp configs as gfx942: [1,4,1], [2,2,1], [4,1,1]\n\n"; + + // ========================================================================= + // Register kernels + // ========================================================================= + std::cout << "Registering kernels for " << gfx_arch << "...\n";
+ + Registry registry; + registry.set_name("gfx950_gemm"); + REGISTER_GENERATED_KERNELS(registry, gfx_arch); + + std::cout << " Registered " << registry.size() << " kernel(s)\n"; + + if(args.has("--list") || args.has("--list-verbose")) + { + std::cout << "\n"; + print_registered_kernels(registry, std::cout, args.has("--list-verbose")); + return 0; + } + + if(registry.size() == 0) + { + std::cerr << "ERROR: No kernels registered for " << gfx_arch << "!\n"; + std::cerr << " Did you build with -DGPU_TARGETS=gfx950?\n"; + return 1; + } + + // ========================================================================= + // Create Dispatcher + // ========================================================================= + Dispatcher dispatcher(&registry); + + // ========================================================================= + // Setup Problem + // ========================================================================= + const int M = args.get_int("--M", 1024); + const int N = args.get_int("--N", 1024); + const int K = args.get_int("--K", 1024); + + std::cout << "\nProblem: " << M << " x " << N << " x " << K << "\n"; + + Problem problem(M, N, K); + + using DataType = ck_tile::fp16_t; + GpuBuffer<DataType> a_dev(M * K); + GpuBuffer<DataType> b_dev(K * N); + GpuBuffer<DataType> c_dev(M * N); + + std::vector<DataType> a_host(M * K, DataType(1.0f)); + std::vector<DataType> b_host(K * N, DataType(1.0f)); + a_dev.copy_from_host(a_host.data()); + b_dev.copy_from_host(b_host.data()); + c_dev.zero(); + + // ========================================================================= + // Select and Run + // ========================================================================= + auto selected = dispatcher.select_kernel(problem); + if(!selected) + { + std::cerr << "ERROR: No suitable kernel found for " << M << "x" << N << "x" << K << "\n"; + return 1; + } + std::cout << " Selected: " << selected->get_name() << "\n"; + + float time_ms = dispatcher.run(a_dev.get(), b_dev.get(), c_dev.get(), problem, nullptr); + std::cout << " Time: " << std::fixed << std::setprecision(4) << time_ms << " ms\n"; + std::cout << " TFLOPS: " << std::setprecision(2) << calculate_tflops(M, N, K, time_ms) << "\n"; + + // ========================================================================= + // Verify + // ========================================================================= + std::cout << "\nVerification:\n"; + std::vector<DataType> c_host(M * N); + c_dev.copy_to_host(c_host.data()); + + const float expected = static_cast<float>(K); + int errors = 0; + for(int i = 0; i < std::min(M * N, 1024); ++i) + { + if(std::abs(static_cast<float>(c_host[i]) - expected) > 0.01f * expected + 1.0f) + ++errors; + } + + bool passed = (errors == 0); + std::cout << " Expected value: " << expected << "\n"; + std::cout << " Errors (first 1024 elements): " << errors << "\n"; + std::cout << " Status: " << (passed ? "PASS" : "FAIL") << "\n"; + + print_separator(); + return passed ? 0 : 1; +} diff --git a/dispatcher/examples/gemm/cpp/09_ml_heuristic.cpp b/dispatcher/examples/gemm/cpp/09_ml_heuristic.cpp new file mode 100644 index 0000000000..cec6d1cd02 --- /dev/null +++ b/dispatcher/examples/gemm/cpp/09_ml_heuristic.cpp @@ -0,0 +1,211 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/** + * Example 09: ML-Based Kernel Selection (Native C++) + * + * Uses a trained LightGBM model loaded via the C API to predict TFLOPS + * for each kernel in the registry and select the best one.
The kernels + * are JIT-compiled at build time via DECL_KERNEL_SET (same as other examples). + * + * Build: cd dispatcher/build && cmake .. && make gemm_09_ml_heuristic + * Run: ./gemm_09_ml_heuristic --model <path/to/model_tflops.lgbm> + */ + +#include <chrono> +#include <iomanip> +#include <iostream> +#include <string> +#include <tuple> +#include <vector> + +#include "ck_tile/dispatcher.hpp" +#include "ck_tile/dispatcher/kernel_decl.hpp" +#include "ck_tile/dispatcher/example_args.hpp" +#include "ck_tile/dispatcher/ml_heuristic.hpp" + +using namespace ck_tile::dispatcher; +using namespace ck_tile::dispatcher::utils; +using Signature = decl::Signature; +using Algorithm = decl::Algorithm; + +// Multiple kernel configs for ML to choose from +DECL_KERNEL_SET(ml_kernels, + // Small tiles + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(64, 64, 32) + .wave(2, 2, 1) + .warp(16, 16, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942") + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(64, 64, 64) + .wave(2, 2, 1) + .warp(16, 16, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942") + // Medium tiles + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(128, 128, 32) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942") + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(128, 128, 64) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942") + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(128, 128, 64) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv4") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942") + // Large tiles + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(256, 256, 32) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942") + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(256, 128, 32) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942") + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(128, 256, 32) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942")); + +int main(int argc, char* argv[]) +{ + ExampleArgs args("Example 09: ML-Based Kernel Selection", + "Uses trained LightGBM model for kernel selection"); + args.add_option("--arch", "gfx942", "GPU architecture"); + args.add_option("--model", "", "Path to LightGBM model file (.lgbm)"); + args.add_option("--log_transform", "false", "Model uses log1p transform"); + + if(!args.parse(argc, argv)) + return 0; + + print_header("Example 09: ML-Based Kernel Selection"); + + std::string gfx_arch = args.get("--arch", "gfx942"); + std::string model_path = args.get("--model", ""); + bool log_transform = (args.get("--log_transform", "false") == "true"); + + if(model_path.empty()) + { + std::cerr << "Error: --model is required" << std::endl; + std::cerr << "Usage: ./gemm_09_ml_heuristic --model path/to/model_tflops.lgbm" << std::endl; + return 1; + } + + // Setup Registry (kernels are JIT compiled from DECL_KERNEL_SET above) + Registry registry; + REGISTER_GENERATED_KERNELS(registry, gfx_arch); + std::cout << "Registry: " << registry.size() << " kernel(s)" << std::endl; + + // Load ML model and create heuristic + HardwareProfile hw; + MLHeuristic
ml_heuristic(model_path, &registry, hw, log_transform); + if(!ml_heuristic.is_loaded()) + { + std::cerr << "Failed to load model. Exiting." << std::endl; + return 1; + } + + // Wire ML heuristic into dispatcher + Dispatcher dispatcher(&registry); + dispatcher.set_strategy(Dispatcher::SelectionStrategy::Heuristic); + dispatcher.set_heuristic([&ml_heuristic](const Problem& p) { return ml_heuristic(p); }); + + std::cout << "Strategy: ML Heuristic (LightGBM)" << std::endl; + + // Test with different problem sizes + using DataType = ck_tile::fp16_t; + std::vector<std::tuple<int, int, int>> sizes = { + {128, 128, 64}, + {512, 512, 256}, + {1024, 1024, 512}, + {2048, 2048, 1024}, + }; + + std::cout << std::endl + << std::setw(20) << "Shape" << std::setw(30) << "Selected Kernel" << std::setw(15) + << "Pred TFLOPS" << std::setw(12) << "Select ms" << std::setw(10) << "Status" + << std::endl; + std::cout << std::string(87, '-') << std::endl; + + bool all_passed = true; + + for(const auto& [M, N, K] : sizes) + { + Problem problem; + problem.M = M; + problem.N = N; + problem.K = K; + problem.k_batch = 1; + + auto t0 = std::chrono::high_resolution_clock::now(); + auto kernel = dispatcher.select_kernel(problem); + auto t1 = std::chrono::high_resolution_clock::now(); + double select_ms = std::chrono::duration<double, std::milli>(t1 - t0).count(); + + std::string size_str = + std::to_string(M) + "x" + std::to_string(N) + "x" + std::to_string(K); + + if(!kernel) + { + std::cout << std::setw(20) << size_str << std::setw(30) << "NONE" << std::setw(15) + << "N/A" << std::setw(12) << std::fixed << std::setprecision(2) << select_ms + << std::setw(10) << "FAIL" << std::endl; + all_passed = false; + continue; + } + + double pred = ml_heuristic.predict_tflops(problem, kernel->get_key()); + std::string name = kernel->get_key().encode_identifier(); + if(name.length() > 27) + name = name.substr(0, 27) + ".."; + + std::cout << std::setw(20) << size_str << std::setw(30) << name << std::setw(15) + << std::fixed << std::setprecision(2) << pred << std::setw(12) + << std::setprecision(2) << select_ms << std::setw(10) << "OK" << std::endl; + } + + std::cout << std::endl + << (all_passed ? "*** ALL TESTS PASSED ***" : "*** SOME TESTS FAILED ***") + << std::endl; + + return all_passed ?
0 : 1; +} diff --git a/dispatcher/examples/gemm/cpp/README.md b/dispatcher/examples/gemm/cpp/README.md index 1d81a90a0e..79d60d1198 100644 --- a/dispatcher/examples/gemm/cpp/README.md +++ b/dispatcher/examples/gemm/cpp/README.md @@ -29,14 +29,14 @@ cd examples ## Examples -| Example | Description | Complexity | -|---------|-------------|------------| -| [01_basic_gemm.cpp](01_basic_gemm.cpp) | Basic GEMM with declarative API, autofill, autocorrect | ★☆☆☆☆ | -| [02_multi_size.cpp](02_multi_size.cpp) | Wildcard expansion for multiple configurations | ★★☆☆☆ | -| [03_benchmark_validation.cpp](03_benchmark_validation.cpp) | Performance benchmarking with CPU reference validation | ★★☆☆☆ | -| [04_heuristics.cpp](04_heuristics.cpp) | Heuristic-based kernel selection | ★★★☆☆ | -| [05_json_export.cpp](05_json_export.cpp) | Registry JSON export for external tools | ★★☆☆☆ | -| [06_multi_registry.cpp](06_multi_registry.cpp) | Multiple registries with named kernel sets | ★★★☆☆ | +| Example | Description | +|---------|-------------| +| [01_basic_gemm.cpp](01_basic_gemm.cpp) | Basic GEMM with declarative API, autofill, autocorrect | +| [02_multi_size.cpp](02_multi_size.cpp) | Wildcard expansion for multiple configurations | +| [03_benchmark_validation.cpp](03_benchmark_validation.cpp) | Performance benchmarking with CPU reference validation | +| [04_heuristics.cpp](04_heuristics.cpp) | Heuristic-based kernel selection | +| [05_json_export.cpp](05_json_export.cpp) | Registry JSON export for external tools | +| [06_multi_registry.cpp](06_multi_registry.cpp) | Multiple registries with named kernel sets | ## Example Details @@ -225,5 +225,5 @@ DECL_KERNEL_SET(my_kernels, ## Related Documentation - [Python GEMM Examples](../python/README.md) -- [Convolution Examples](../../conv/cpp/README.md) +- [C++ Headers (GEMM + Grouped Conv)](../../../include/ck_tile/dispatcher/README.md) - [Main Dispatcher README](../../../README.md) diff --git a/dispatcher/examples/gemm/python/01_basic_gemm.py b/dispatcher/examples/gemm/python/01_basic_gemm.py index 93a78d24d1..8c23da89e2 100644 --- a/dispatcher/examples/gemm/python/01_basic_gemm.py +++ b/dispatcher/examples/gemm/python/01_basic_gemm.py @@ -7,41 +7,37 @@ Example 01: Basic GEMM with Multiple Kernels Demonstrates: -1. Declaring multiple kernel configurations -2. Printing all registered kernels -3. Running each kernel and validating output +1. Building a Registry with multiple kernel configurations +2. Parallel JIT compilation via registry.build() +3. Running each kernel and validating output against NumPy reference 4. 
Comparing performance across kernels -Complexity: ★★☆☆☆ - Usage: python3 01_basic_gemm.py - python3 01_basic_gemm.py --help python3 01_basic_gemm.py --dtype bf16 python3 01_basic_gemm.py --size 2048 + python3 01_basic_gemm.py --num-kernels 4 + python3 01_basic_gemm.py --workers 4 """ import sys +import time import argparse from pathlib import Path from dataclasses import dataclass -from typing import List sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python")) import numpy as np from ctypes_utils import ( KernelConfig, - setup_gemm_dispatcher, - cleanup_gemm, - reset_for_example, + Registry, + detect_gpu_arch, ) @dataclass class KernelSpec: - """Specification for a kernel configuration""" - name: str tile_m: int tile_n: int @@ -50,80 +46,37 @@ class KernelSpec: scheduler: str = "intrawave" -# Define multiple kernel configurations to test (50+ kernels) KERNEL_SPECS = [ - # Small tiles - compv3 + # Small tiles KernelSpec("small_64x64_k32", 64, 64, 32, "compv3"), KernelSpec("small_64x64_k64", 64, 64, 64, "compv3"), - # Small tiles - compv4 KernelSpec("small_64x64_v4_k32", 64, 64, 32, "compv4"), - KernelSpec("small_64x64_v4_k64", 64, 64, 64, "compv4"), - # Medium tiles - compv3 + # Medium tiles KernelSpec("med_128x128_k32", 128, 128, 32, "compv3"), KernelSpec("med_128x128_k64", 128, 128, 64, "compv3"), - KernelSpec("med_128x128_k128", 128, 128, 128, "compv3"), - # Medium tiles - compv4 KernelSpec("med_128x128_v4_k32", 128, 128, 32, "compv4"), KernelSpec("med_128x128_v4_k64", 128, 128, 64, "compv4"), - KernelSpec("med_128x128_v4_k128", 128, 128, 128, "compv4"), - # Rectangular tiles - compv3 + # Rectangular tiles KernelSpec("rect_64x128_k32", 64, 128, 32, "compv3"), KernelSpec("rect_64x128_k64", 64, 128, 64, "compv3"), KernelSpec("rect_128x64_k32", 128, 64, 32, "compv3"), KernelSpec("rect_128x64_k64", 128, 64, 64, "compv3"), - # Rectangular tiles - compv4 KernelSpec("rect_64x128_v4_k32", 64, 128, 32, "compv4"), - KernelSpec("rect_64x128_v4_k64", 64, 128, 64, "compv4"), KernelSpec("rect_128x64_v4_k32", 128, 64, 32, "compv4"), - KernelSpec("rect_128x64_v4_k64", 128, 64, 64, "compv4"), - # Large tiles - compv3 + # Large tiles KernelSpec("large_256x128_k32", 256, 128, 32, "compv3"), - KernelSpec("large_256x128_k64", 256, 128, 64, "compv3"), KernelSpec("large_128x256_k32", 128, 256, 32, "compv3"), - KernelSpec("large_128x256_k64", 128, 256, 64, "compv3"), KernelSpec("large_256x256_k32", 256, 256, 32, "compv3"), - KernelSpec("large_256x256_k64", 256, 256, 64, "compv3"), - # Large tiles - compv4 KernelSpec("large_256x128_v4_k32", 256, 128, 32, "compv4"), - KernelSpec("large_256x128_v4_k64", 256, 128, 64, "compv4"), - KernelSpec("large_128x256_v4_k32", 128, 256, 32, "compv4"), - KernelSpec("large_128x256_v4_k64", 128, 256, 64, "compv4"), KernelSpec("large_256x256_v4_k32", 256, 256, 32, "compv4"), - KernelSpec("large_256x256_v4_k64", 256, 256, 64, "compv4"), - # Interwave scheduler variants - KernelSpec("int_64x64_k32", 64, 64, 32, "compv3", "interwave"), + # Interwave scheduler KernelSpec("int_128x128_k32", 128, 128, 32, "compv3", "interwave"), - KernelSpec("int_128x128_k64", 128, 128, 64, "compv3", "interwave"), KernelSpec("int_256x128_k32", 256, 128, 32, "compv3", "interwave"), - # More tile_k variations - compv3 - KernelSpec("med_128x128_k16", 128, 128, 16, "compv3"), - KernelSpec("rect_64x128_k16", 64, 128, 16, "compv3"), - KernelSpec("rect_128x64_k16", 128, 64, 16, "compv3"), - # More tile_k variations - compv4 - KernelSpec("med_128x128_v4_k16", 128, 128, 16, "compv4"), - 
KernelSpec("rect_64x128_v4_k16", 64, 128, 16, "compv4"), - KernelSpec("rect_128x64_v4_k16", 128, 64, 16, "compv4"), - # Additional rectangular - KernelSpec("rect_32x64_k32", 32, 64, 32, "compv3"), - KernelSpec("rect_64x32_k32", 64, 32, 32, "compv3"), - KernelSpec("rect_32x128_k32", 32, 128, 32, "compv3"), - KernelSpec("rect_128x32_k32", 128, 32, 32, "compv3"), - # Additional compv4 variants - KernelSpec("rect_32x64_v4_k32", 32, 64, 32, "compv4"), - KernelSpec("rect_64x32_v4_k32", 64, 32, 32, "compv4"), - KernelSpec("rect_32x128_v4_k32", 32, 128, 32, "compv4"), - KernelSpec("rect_128x32_v4_k32", 128, 32, 32, "compv4"), ] -def create_kernel_config(spec: KernelSpec, dtype: str, arch: str) -> KernelConfig: - """Create a KernelConfig from a spec""" - # Adjust warp tiles based on tile size - if spec.tile_m <= 64: - warp_m, warp_n = 16, 16 - else: - warp_m, warp_n = 32, 32 - +def spec_to_config(spec: KernelSpec, dtype: str, arch: str) -> KernelConfig: + warp_m, warp_n = (16, 16) if spec.tile_m <= 64 else (32, 32) return KernelConfig( dtype_a=dtype, dtype_b=dtype, @@ -148,180 +101,118 @@ def create_kernel_config(spec: KernelSpec, dtype: str, arch: str) -> KernelConfi ) -def print_kernel_table(specs: List[KernelSpec], dtype: str): - """Print a formatted table of kernel configurations""" - print("\n" + "=" * 70) - print(f" DECLARED KERNEL CONFIGURATIONS ({len(specs)} kernels)") - print("=" * 70) - print(f"\n {'#':<3} {'Name':<18} {'Tile':<14} {'Pipeline':<10} {'Scheduler':<12}") - print(" " + "-" * 68) - - for i, spec in enumerate(specs, 1): - tile = f"{spec.tile_m}x{spec.tile_n}x{spec.tile_k}" - print( - f" {i:<3} {spec.name:<18} {tile:<14} {spec.pipeline:<10} {spec.scheduler:<12}" - ) - - print(" " + "-" * 68) - print(f" Data type: {dtype}") - - def main(): - parser = argparse.ArgumentParser( - description="Basic GEMM Example with Multiple Kernels", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - python3 01_basic_gemm.py # Default FP16 with 4 kernels - python3 01_basic_gemm.py --dtype bf16 # BF16 mode - python3 01_basic_gemm.py --size 2048 # Larger problem size - python3 01_basic_gemm.py --num-kernels 2 # Test only 2 kernels - """, - ) + parser = argparse.ArgumentParser(description="Basic GEMM with Multiple Kernels") + parser.add_argument("--dtype", default="fp16", choices=["fp16", "bf16"]) + parser.add_argument("--arch", default=detect_gpu_arch()) + parser.add_argument("--size", type=int, default=512, help="Problem size MxNxK") + parser.add_argument("--num-kernels", type=int, default=0, help="0 = all") parser.add_argument( - "--dtype", - default="fp16", - choices=["fp16", "bf16", "fp32"], - help="Data type (default: fp16)", - ) - parser.add_argument( - "--arch", - default="gfx942", - help="Target architecture (default: gfx942)", - ) - parser.add_argument( - "--size", - type=int, - default=512, - help="Problem size MxNxK (default: 512)", - ) - parser.add_argument( - "--num-kernels", - type=int, - default=0, - help="Number of kernels to test (0 = all)", + "--workers", type=int, default=0, help="Max parallel JIT workers (0 = auto)" ) args = parser.parse_args() - reset_for_example() - print("=" * 70) print("Example 01: Basic GEMM with Multiple Kernels") print("=" * 70) - # Select kernels to test specs = KERNEL_SPECS[: args.num_kernels] if args.num_kernels > 0 else KERNEL_SPECS - # ========================================================================= - # Step 1: Print all kernel configurations - # 
========================================================================= - print_kernel_table(specs, args.dtype) - - # ========================================================================= - # Step 2: Setup and test each kernel - # ========================================================================= - print("\n" + "=" * 70) - print(" RUNNING KERNELS") - print("=" * 70) - - np_dtype = np.float16 if args.dtype in ["fp16", "bf16"] else np.float32 - M, N, K = args.size, args.size, args.size - - results = [] - - print(f"\n Problem size: {M}x{N}x{K}\n") + # Step 1: Build registry print( - f" {'#':<3} {'Name':<18} {'Tile':<14} {'Time (ms)':>10} {'TFLOPS':>10} {'Max Err':>10} {'Status':<8}" + f"\n {len(specs)} kernel configurations, dtype={args.dtype}, arch={args.arch}" ) - print(" " + "-" * 78) - - for i, spec in enumerate(specs, 1): - # Create unique test data per kernel - np.random.seed(42 + i * 1000) - A = (np.random.randn(M, K) * 0.1).astype(np_dtype) - B = (np.random.randn(K, N) * 0.1).astype(np_dtype) - - # Create config and setup dispatcher - config = create_kernel_config(spec, args.dtype, args.arch) - - setup = setup_gemm_dispatcher( - config=config, - registry_name=f"kernel_{spec.name}", - verbose=False, - auto_rebuild=True, + print(f"\n {'#':<3} {'Name':<22} {'Tile':<14} {'Pipeline':<10} {'Scheduler':<12}") + print(" " + "-" * 64) + for i, s in enumerate(specs, 1): + print( + f" {i:<3} {s.name:<22} {s.tile_m}x{s.tile_n}x{s.tile_k:<6} {s.pipeline:<10} {s.scheduler:<12}" ) + reg = Registry(name="basic_gemm") + for s in specs: + reg.register_kernel(spec_to_config(s, args.dtype, args.arch)) + + # Step 2: Parallel JIT build via registry.build() + workers = args.workers if args.workers > 0 else None + print( + f"\n--- Parallel JIT Build ({len(specs)} kernels{f', workers={workers}' if workers else ''}) ---" + ) + + t0 = time.perf_counter() + setups = reg.build(verbose=False, max_workers=workers) + jit_build_s = time.perf_counter() - t0 + + built = sum(1 for s in setups if s.success) + print(f" Built: {built}/{len(specs)} kernels in {jit_build_s:.1f} s") + + if built == 0: + print(" ERROR: No kernels built") + return 1 + + # Step 3: Run each kernel and validate + print(f"\n--- Running Kernels (problem {args.size}x{args.size}x{args.size}) ---") + np_dtype = np.float16 if args.dtype in ["fp16", "bf16"] else np.float32 + M = N = K = args.size + + np.random.seed(42) + A = (np.random.randn(M, K) * 0.1).astype(np_dtype) + B = (np.random.randn(K, N) * 0.1).astype(np_dtype) + C_ref = np.matmul(A.astype(np.float32), B.astype(np.float32)).astype(np_dtype) + + print( + f"\n {'#':<3} {'Name':<22} {'Tile':<14} {'Time(ms)':>10} {'TFLOPS':>10} {'MaxErr':>10} {'Status':<6}" + ) + print(" " + "-" * 80) + + results = [] + for i, (spec, setup) in enumerate(zip(specs, setups), 1): tile = f"{spec.tile_m}x{spec.tile_n}x{spec.tile_k}" if not setup.success: print( - f" {i:<3} {spec.name:<18} {tile:<14} {'N/A':>10} {'N/A':>10} {'N/A':>10} {'FAIL':<8}" + f" {i:<3} {spec.name:<22} {tile:<14} {'---':>10} {'---':>10} {'---':>10} {'SKIP':<6}" ) - results.append((spec.name, False, 0, 0, 0)) - cleanup_gemm() + results.append((spec.name, False, 0.0, 0.0, 0.0)) continue - dispatcher = setup.dispatcher - - # Check if size is supported - if not dispatcher.is_supported(M, N, K): + disp = setup.dispatcher + if not disp.is_supported(M, N, K): print( - f" {i:<3} {spec.name:<18} {tile:<14} {'N/A':>10} {'N/A':>10} {'N/A':>10} {'SKIP':<8}" + f" {i:<3} {spec.name:<22} {tile:<14} {'---':>10} {'---':>10} {'---':>10} {'SKIP':<6}" 
) - results.append((spec.name, False, 0, 0, 0)) - cleanup_gemm() + results.append((spec.name, False, 0.0, 0.0, 0.0)) continue - # Run GEMM - result = dispatcher.run(A, B, M, N, K) - - if not result.success: + res = disp.run(A, B, M, N, K) + if not res.success: print( - f" {i:<3} {spec.name:<18} {tile:<14} {'N/A':>10} {'N/A':>10} {'N/A':>10} {'FAIL':<8}" + f" {i:<3} {spec.name:<22} {tile:<14} {'---':>10} {'---':>10} {'---':>10} {'FAIL':<6}" ) - results.append((spec.name, False, 0, 0, 0)) - cleanup_gemm() + results.append((spec.name, False, 0.0, 0.0, 0.0)) continue - # Validate against NumPy reference - C_ref = np.matmul(A.astype(np.float32), B.astype(np.float32)).astype(np_dtype) - max_err = np.max(np.abs(result.output - C_ref)) - - # Check if within tolerance - passed = max_err < 1e-2 - status = "PASS" if passed else "FAIL" - + max_err = float(np.max(np.abs(res.output - C_ref))) + ok = max_err < 1e-2 + tag = "PASS" if ok else "FAIL" print( - f" {i:<3} {spec.name:<18} {tile:<14} {result.time_ms:>10.4f} {result.tflops:>10.2f} {max_err:>10.2e} {status:<8}" + f" {i:<3} {spec.name:<22} {tile:<14} {res.time_ms:>10.4f} {res.tflops:>10.2f} {max_err:>10.2e} {tag:<6}" ) - results.append((spec.name, passed, result.time_ms, result.tflops, max_err)) - - cleanup_gemm() - - # ========================================================================= - # Step 3: Summary - # ========================================================================= - print("\n" + "=" * 70) - print(" SUMMARY") - print("=" * 70) + results.append((spec.name, ok, res.time_ms, res.tflops, max_err)) + # Step 4: Summary passed = sum(1 for r in results if r[1]) failed = len(results) - passed + valid = [r for r in results if r[1]] - print(f"\n Results: {passed}/{len(results)} kernels passed") - print(f" Problem: {M}x{N}x{K}, dtype={args.dtype}") - - if results: - valid_results = [r for r in results if r[1]] - if valid_results: - best = max(valid_results, key=lambda x: x[3]) - print(f"\n Best kernel: {best[0]} ({best[3]:.2f} TFLOPS)") - - if failed == 0: - print("\n *** ALL KERNELS PASSED ***") - else: - print(f"\n *** {failed} KERNELS FAILED ***") - + print("\n" + "=" * 70) + print(f" Results: {passed}/{len(results)} passed") + print(f" Problem: {M}x{N}x{K}, dtype={args.dtype}") + print(f" JIT time: {jit_build_s:.1f} s (parallel)") + if valid: + best = max(valid, key=lambda x: x[3]) + print(f" Best: {best[0]} ({best[3]:.2f} TFLOPS)") + print(f" Status: {'PASS' if failed == 0 else 'FAIL'}") print("=" * 70) return 0 if failed == 0 else 1 diff --git a/dispatcher/examples/gemm/python/02_batch_gemm.py b/dispatcher/examples/gemm/python/02_batch_gemm.py index 039aba2790..745ec1c494 100644 --- a/dispatcher/examples/gemm/python/02_batch_gemm.py +++ b/dispatcher/examples/gemm/python/02_batch_gemm.py @@ -6,9 +6,7 @@ """ Example 02: Batch GEMM -Runs multiple GEMM operations with different sizes. - -Complexity: ★★☆☆☆ +Runs multiple GEMM operations with different sizes using JIT compilation. 
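All of the reworked Python examples in this diff follow the same flow: register one or more `KernelConfig`s on a `Registry`, JIT-build them with `Registry.build()`, and run through the returned dispatcher. The following is only a rough sketch using names that appear in this diff (`Registry`, `KernelConfig`, `detect_gpu_arch`); the exact `KernelConfig` field list is abbreviated and should be treated as illustrative.

```python
import numpy as np
from ctypes_utils import KernelConfig, Registry, detect_gpu_arch

# One fp16 128x128x32 kernel; remaining KernelConfig fields left at defaults.
config = KernelConfig(
    dtype_a="fp16", dtype_b="fp16", dtype_c="fp16",
    tile_m=128, tile_n=128, tile_k=32,
    gfx_arch=detect_gpu_arch(),
)

reg = Registry(name="sketch")
reg.register_kernel(config)

setups = reg.build(verbose=True)       # JIT-compiles every registered config
ok = [s for s in setups if s.success]  # one setup object per config
dispatcher = ok[0].dispatcher

M = N = K = 512
A = (np.random.randn(M, K) * 0.1).astype(np.float16)
B = (np.random.randn(K, N) * 0.1).astype(np.float16)
res = dispatcher.run(A, B, M, N, K)    # res.output, res.time_ms, res.tflops
```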
Usage: python3 02_batch_gemm.py @@ -25,9 +23,8 @@ import numpy as np from ctypes_utils import ( KernelConfig, - setup_gemm_dispatcher, - cleanup_gemm, - reset_for_example, + Registry, + detect_gpu_arch, ) @@ -55,20 +52,20 @@ Examples: help="Maximum problem size (default: 4096)", ) parser.add_argument( - "--arch", default="gfx942", help="Target architecture (default: gfx942)" + "--arch", + default=detect_gpu_arch(), + help="Target architecture (auto-detected from rocminfo)", ) args = parser.parse_args() - reset_for_example() - print("=" * 60) print("Example 02: Batch GEMM") print("=" * 60) # ========================================================================= - # Step 1: Setup dispatcher + # Step 1: JIT build dispatcher # ========================================================================= - print("\nStep 1: Setup Dispatcher") + print("\nStep 1: JIT Build Dispatcher") config = KernelConfig( dtype_a=args.dtype, @@ -80,19 +77,22 @@ Examples: gfx_arch=args.arch, ) - setup = setup_gemm_dispatcher(config, registry_name="batch_gemm", verbose=True) - if not setup.success: - print(f" ERROR: {setup.error}") + reg = Registry(name="batch_gemm") + reg.register_kernel(config) + + setups = reg.build(verbose=True) + if not setups or not setups[0].success: + error = setups[0].error if setups else "No kernels built" + print(f" ERROR: {error}") return 1 - dispatcher = setup.dispatcher + dispatcher = setups[0].dispatcher # ========================================================================= # Step 2: Run batch of different sizes # ========================================================================= print("\nStep 2: Run Batch") - # Generate sizes up to max_size all_sizes = [ (256, 256, 256), (512, 512, 512), @@ -135,9 +135,6 @@ Examples: avg_tflops = (total_ops / 1e12) / (total_time / 1000) print(f"\n Total: {total_time:.2f} ms, Average: {avg_tflops:.2f} TFLOPS") - # Cleanup - cleanup_gemm() - print("\n" + "=" * 60) print("Batch GEMM complete!") print("=" * 60) diff --git a/dispatcher/examples/gemm/python/03_benchmark.py b/dispatcher/examples/gemm/python/03_benchmark.py index bec1b7e2fb..508b3f8b35 100644 --- a/dispatcher/examples/gemm/python/03_benchmark.py +++ b/dispatcher/examples/gemm/python/03_benchmark.py @@ -6,9 +6,8 @@ """ Example 03: Benchmark -Performance benchmarking with compute-optimized kernel configuration. - -Complexity: ★★★☆☆ +Performance benchmarking with compute-optimized kernel configuration +using JIT compilation. 
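For reference, the TFLOPS figures these benchmarks print (e.g. the `avg_tflops = (total_ops / 1e12) / (total_time / 1000)` line above) follow from the standard 2*M*N*K operation count for a GEMM. A small, hypothetical helper mirroring that arithmetic:

```python
# Hypothetical helper mirroring the TFLOPS arithmetic in these examples:
# a GEMM performs 2*M*N*K floating-point operations (one multiply and one add
# per inner-product term).
def gemm_tflops(M: int, N: int, K: int, time_ms: float) -> float:
    ops = 2.0 * M * N * K
    return (ops / 1e12) / (time_ms / 1e3)

# Example: a 4096x4096x4096 GEMM finishing in 1.0 ms
# 2 * 4096**3 ~= 1.374e11 ops  ->  ~137.4 TFLOPS
print(f"{gemm_tflops(4096, 4096, 4096, 1.0):.1f}")
```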
Usage: python3 03_benchmark.py @@ -26,9 +25,8 @@ import numpy as np from ctypes_utils import ( KernelConfig, - setup_gemm_dispatcher, - cleanup_gemm, - reset_for_example, + Registry, + detect_gpu_arch, ) @@ -63,20 +61,20 @@ Examples: "--iterations", type=int, default=10, help="Benchmark iterations (default: 10)" ) parser.add_argument( - "--arch", default="gfx942", help="Target architecture (default: gfx942)" + "--arch", + default=detect_gpu_arch(), + help="Target architecture (auto-detected from rocminfo)", ) args = parser.parse_args() - reset_for_example() - print("=" * 60) print("Example 03: Benchmark") print("=" * 60) # ========================================================================= - # Step 1: Setup dispatcher with compute-optimized config + # Step 1: JIT build dispatcher with compute-optimized config # ========================================================================= - print("\nStep 1: Setup Dispatcher") + print("\nStep 1: JIT Build Dispatcher") config = KernelConfig( dtype_a=args.dtype, @@ -90,12 +88,16 @@ Examples: gfx_arch=args.arch, ) - setup = setup_gemm_dispatcher(config, registry_name="benchmark", verbose=True) - if not setup.success: - print(f" ERROR: {setup.error}") + reg = Registry(name="benchmark") + reg.register_kernel(config) + + setups = reg.build(verbose=True) + if not setups or not setups[0].success: + error = setups[0].error if setups else "No kernels built" + print(f" ERROR: {error}") return 1 - dispatcher = setup.dispatcher + dispatcher = setups[0].dispatcher # ========================================================================= # Step 2: Benchmark @@ -130,11 +132,9 @@ Examples: A = np.random.randn(M, K).astype(np_dtype) * 0.1 B = np.random.randn(K, N).astype(np_dtype) * 0.1 - # Warmup for _ in range(args.warmup): dispatcher.run(A, B, M, N, K) - # Benchmark times = [] for _ in range(args.iterations): result = dispatcher.run(A, B, M, N, K) @@ -150,9 +150,6 @@ Examples: f" {M:>4}x{N:>4}x{K:<4} | {min_time:>10.4f} | {avg_time:>10.4f} | {tflops:>10.2f}" ) - # Cleanup - cleanup_gemm() - # Summary print("\n" + "=" * 60) print("Summary") diff --git a/dispatcher/examples/gemm/python/04_validation.py b/dispatcher/examples/gemm/python/04_validation.py index 2fe54c53f7..d56621c3c8 100644 --- a/dispatcher/examples/gemm/python/04_validation.py +++ b/dispatcher/examples/gemm/python/04_validation.py @@ -6,9 +6,7 @@ """ Example 04: Validation -Validates GPU GEMM against NumPy reference. - -Complexity: ★★★☆☆ +Validates GPU GEMM against NumPy reference using JIT compilation. 
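The validation examples all use the same reference check: compute the product in float32 with NumPy and compare the GPU output element-wise. A self-contained sketch (the helper name is hypothetical; `atol=1e-2` matches the default tolerance used in this diff):

```python
import numpy as np

def validate_fp16_gemm(A: np.ndarray, B: np.ndarray, C_gpu: np.ndarray,
                       atol: float = 1e-2) -> bool:
    """Compare GPU fp16 output against a float32 NumPy reference."""
    C_ref = np.matmul(A.astype(np.float32), B.astype(np.float32))
    max_err = float(np.max(np.abs(C_gpu.astype(np.float32) - C_ref)))
    return max_err < atol
```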
Usage: python3 04_validation.py @@ -26,9 +24,8 @@ import numpy as np from ctypes_utils import ( KernelConfig, Validator, - setup_gemm_dispatcher, - cleanup_gemm, - reset_for_example, + Registry, + detect_gpu_arch, ) @@ -56,20 +53,20 @@ Examples: "--atol", type=float, default=1e-2, help="Absolute tolerance (default: 1e-2)" ) parser.add_argument( - "--arch", default="gfx942", help="Target architecture (default: gfx942)" + "--arch", + default=detect_gpu_arch(), + help="Target architecture (auto-detected from rocminfo)", ) args = parser.parse_args() - reset_for_example() - print("=" * 60) print("Example 04: Validation") print("=" * 60) # ========================================================================= - # Step 1: Setup dispatcher + # Step 1: JIT build dispatcher # ========================================================================= - print("\nStep 1: Setup Dispatcher") + print("\nStep 1: JIT Build Dispatcher") config = KernelConfig( dtype_a=args.dtype, @@ -81,12 +78,16 @@ Examples: gfx_arch=args.arch, ) - setup = setup_gemm_dispatcher(config, registry_name="validation", verbose=True) - if not setup.success: - print(f" ERROR: {setup.error}") + reg = Registry(name="validation") + reg.register_kernel(config) + + setups = reg.build(verbose=True) + if not setups or not setups[0].success: + error = setups[0].error if setups else "No kernels built" + print(f" ERROR: {error}") return 1 - dispatcher = setup.dispatcher + dispatcher = setups[0].dispatcher # ========================================================================= # Step 2: Run validation tests @@ -139,9 +140,6 @@ Examples: print(f" {name:<15} | {M}x{N}x{K:<5} | {max_err:>10.2e} | FAILED") failed += 1 - # Cleanup - cleanup_gemm() - # Summary print("\n" + "=" * 60) total = passed + failed diff --git a/dispatcher/examples/gemm/python/05_numpy_integration.py b/dispatcher/examples/gemm/python/05_numpy_integration.py index 493ce46d22..b0af5fa700 100644 --- a/dispatcher/examples/gemm/python/05_numpy_integration.py +++ b/dispatcher/examples/gemm/python/05_numpy_integration.py @@ -8,7 +8,6 @@ Example 05: NumPy Integration Shows how to create a GPU-accelerated matmul wrapper. -Complexity: ★★☆☆☆ Usage: python3 05_numpy_integration.py @@ -29,6 +28,7 @@ from ctypes_utils import ( setup_gemm_dispatcher, cleanup_gemm, reset_for_example, + detect_gpu_arch, ) @@ -70,7 +70,9 @@ Examples: help="Data type (default: fp16)", ) parser.add_argument( - "--arch", default="gfx942", help="Target architecture (default: gfx942)" + "--arch", + default=detect_gpu_arch(), + help="Target architecture (auto-detected from rocminfo)", ) args = parser.parse_args() diff --git a/dispatcher/examples/gemm/python/06_json_export.py b/dispatcher/examples/gemm/python/06_json_export.py index 9e062e507b..780032ce06 100644 --- a/dispatcher/examples/gemm/python/06_json_export.py +++ b/dispatcher/examples/gemm/python/06_json_export.py @@ -8,7 +8,6 @@ Example 06: JSON Export Exports registry configuration to JSON. 
-Complexity: ★★☆☆☆ Usage: python3 06_json_export.py @@ -28,6 +27,7 @@ from ctypes_utils import ( setup_gemm_dispatcher, cleanup_gemm, reset_for_example, + detect_gpu_arch, ) @@ -54,7 +54,9 @@ Examples: help="Data type (default: fp16)", ) parser.add_argument( - "--arch", default="gfx942", help="Target architecture (default: gfx942)" + "--arch", + default=detect_gpu_arch(), + help="Target architecture (auto-detected from rocminfo)", ) args = parser.parse_args() diff --git a/dispatcher/examples/gemm/python/07_stress_test.py b/dispatcher/examples/gemm/python/07_stress_test.py index 8160030631..620e66eeaf 100644 --- a/dispatcher/examples/gemm/python/07_stress_test.py +++ b/dispatcher/examples/gemm/python/07_stress_test.py @@ -18,7 +18,6 @@ This tests: - Multiple data types (fp16, bf16) - Different schedulers (intrawave, interwave) -Complexity: ★★★★☆ Usage: python3 07_stress_test.py @@ -43,6 +42,7 @@ from ctypes_utils import ( cleanup_gemm, reset_for_example, Validator, + detect_gpu_arch, ) @@ -413,8 +413,8 @@ Examples: ) parser.add_argument( "--arch", - default="gfx942", - help="Target architecture (default: gfx942)", + default=detect_gpu_arch(), + help="Target architecture (auto-detected from rocminfo, override with --arch gfxNNN)", ) args = parser.parse_args() diff --git a/dispatcher/examples/gemm/python/08_heuristics.py b/dispatcher/examples/gemm/python/08_heuristics.py index e2763c0513..acbf1b3ae0 100644 --- a/dispatcher/examples/gemm/python/08_heuristics.py +++ b/dispatcher/examples/gemm/python/08_heuristics.py @@ -19,7 +19,6 @@ Heuristic strategies: - Memory-bound: Optimize memory access for bandwidth-limited cases - Latency-focused: Minimize kernel launch overhead for small problems -Complexity: ★★★★☆ Usage: python3 08_heuristics.py @@ -43,6 +42,7 @@ from ctypes_utils import ( setup_gemm_dispatcher, cleanup_gemm, reset_for_example, + detect_gpu_arch, ) @@ -561,8 +561,8 @@ Examples: ) parser.add_argument( "--arch", - default="gfx942", - help="Target architecture (default: gfx942)", + default=detect_gpu_arch(), + help="Target architecture (auto-detected from rocminfo, override with --arch gfxNNN)", ) args = parser.parse_args() diff --git a/dispatcher/examples/gemm/python/09_ml_heuristic.py b/dispatcher/examples/gemm/python/09_ml_heuristic.py new file mode 100644 index 0000000000..d6726a2033 --- /dev/null +++ b/dispatcher/examples/gemm/python/09_ml_heuristic.py @@ -0,0 +1,305 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Example 09: ML-Based Kernel Selection + +Uses a trained LightGBM model to select the optimal kernel for each problem +size. The model predicts TFLOPS for every candidate in the kernel pool and +picks the highest-scoring one, which is then JIT-compiled and run. + +This replaces the hand-crafted rules in 08_heuristics.py with a data-driven +approach achieving 97-98% of oracle-best TFLOPS efficiency. 
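The "97-98% of oracle-best" figure compares the ML pick against the best kernel in the pool for each problem. The exact aggregation is not shown in this diff; one plausible per-problem definition, as a sketch:

```python
def oracle_efficiency(selected_tflops, oracle_tflops):
    """Mean ratio (in %) of achieved TFLOPS to the per-problem oracle best.

    selected_tflops[i] is the TFLOPS of the ML-selected kernel on problem i;
    oracle_tflops[i] is the best TFLOPS any pool kernel achieved on problem i.
    """
    ratios = [s / o for s, o in zip(selected_tflops, oracle_tflops) if o > 0]
    return 100.0 * sum(ratios) / len(ratios)

# e.g. selections reaching [95.0, 100.0, 98.0] TFLOPS against a per-problem
# oracle of [100.0, 100.0, 100.0] -> ~97.7% efficiency
```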
+ +Complexity: ***** + +Prerequisites: + - Trained model in dispatcher/heuristics/models/gemm_universal_fp8_gfx950/ + - lightgbm, pandas, numpy, pyarrow installed + +Usage: + python3 09_ml_heuristic.py + python3 09_ml_heuristic.py --dtype fp16 --arch gfx942 +""" + +import sys +import argparse +import time +from pathlib import Path +from dataclasses import dataclass +from typing import List + +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python")) +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "heuristics")) + +import numpy as np + +from ctypes_utils import ( + KernelConfig, + setup_gemm_dispatcher, + cleanup_gemm, +) + +from predict import Predictor + + +@dataclass +class KernelSpec: + """Kernel specification -- same structure as 08_heuristics.py""" + + name: str + tile_m: int + tile_n: int + tile_k: int + pipeline: str = "compv3" + scheduler: str = "intrawave" + wave_m: int = 2 + wave_n: int = 2 + wave_k: int = 1 + warp_m: int = 32 + warp_n: int = 32 + warp_k: int = 16 + + +# Kernel pool: representative configs spanning small to large tiles, +# compv3/compv4/mem pipelines, and intrawave/interwave schedulers. +KERNEL_POOL = [ + # Small tiles + KernelSpec("s_64x64_k32_v3", 64, 64, 32, "compv3", warp_m=16, warp_n=16), + KernelSpec("s_64x64_k64_v3", 64, 64, 64, "compv3", warp_m=16, warp_n=16), + KernelSpec("s_64x64_k128_v3", 64, 64, 128, "compv3", warp_m=16, warp_n=16), + KernelSpec("s_64x64_k32_v4", 64, 64, 32, "compv4", warp_m=16, warp_n=16), + KernelSpec("s_64x64_k64_mem", 64, 64, 64, "mem", warp_m=16, warp_n=16), + KernelSpec("s_64x64_k128_mem", 64, 64, 128, "mem", warp_m=16, warp_n=16), + # Medium tiles + KernelSpec("m_128x128_k32_v3", 128, 128, 32, "compv3"), + KernelSpec("m_128x128_k64_v3", 128, 128, 64, "compv3"), + KernelSpec("m_128x128_k128_v3", 128, 128, 128, "compv3"), + KernelSpec("m_128x128_k32_v4", 128, 128, 32, "compv4"), + KernelSpec("m_128x128_k64_v4", 128, 128, 64, "compv4"), + KernelSpec("m_128x128_k64_mem", 128, 128, 64, "mem"), + KernelSpec("m_128x128_k128_mem", 128, 128, 128, "mem"), + # Rectangular medium + KernelSpec("r_64x128_k32", 64, 128, 32, "compv3", warp_m=16), + KernelSpec("r_128x64_k32", 128, 64, 32, "compv3", warp_n=16), + KernelSpec("r_64x128_k64", 64, 128, 64, "compv3", warp_m=16), + KernelSpec("r_128x64_k64", 128, 64, 64, "compv3", warp_n=16), + # Large tiles + KernelSpec("l_256x128_k32", 256, 128, 32, "compv3"), + KernelSpec("l_128x256_k32", 128, 256, 32, "compv3"), + KernelSpec("l_256x256_k32", 256, 256, 32, "compv3"), + KernelSpec("l_256x256_k64", 256, 256, 64, "compv3"), + # Interwave variants + KernelSpec("m_128x128_k64_iw", 128, 128, 64, "compv3", "interwave"), + KernelSpec("m_128x128_k64_mem_iw", 128, 128, 64, "mem", "interwave"), +] + + +def spec_to_feature_dict(spec: KernelSpec, dtype: str, layout: str) -> dict: + """Convert a KernelSpec to the dict format the feature engine expects. + + Note: pad_m/n/k default to True to match KernelConfig defaults and actual + compiled kernels. This ensures the ML model receives the correct padding + flags that will be used during JIT compilation. 
+ """ + return { + "kernel_name": spec.name, + "tile_m": spec.tile_m, + "tile_n": spec.tile_n, + "tile_k": spec.tile_k, + "warp_m": spec.wave_m, + "warp_n": spec.wave_n, + "warp_k": spec.wave_k, + "warp_tile_m": spec.warp_m, + "warp_tile_n": spec.warp_n, + "warp_tile_k": spec.warp_k, + "pipeline": spec.pipeline, + "scheduler": spec.scheduler, + "epilogue": "cshuffle", + "pad_m": True, # Match KernelConfig default + "pad_n": True, # Match KernelConfig default + "pad_k": True, # Match KernelConfig default + "persistent": False, + "dtype": dtype, + "layout": layout, + } + + +def spec_to_kernel_config(spec: KernelSpec, dtype: str, arch: str) -> KernelConfig: + """Convert a KernelSpec to the dispatcher's KernelConfig for JIT compilation.""" + return KernelConfig( + dtype_a=dtype, + dtype_b=dtype, + dtype_c=dtype, + dtype_acc="fp32", + layout_a="row", + layout_b="col", + layout_c="row", + tile_m=spec.tile_m, + tile_n=spec.tile_n, + tile_k=spec.tile_k, + wave_m=spec.wave_m, + wave_n=spec.wave_n, + wave_k=spec.wave_k, + warp_m=spec.warp_m, + warp_n=spec.warp_n, + warp_k=spec.warp_k, + pipeline=spec.pipeline, + scheduler=spec.scheduler, + epilogue="cshuffle", + gfx_arch=arch, + ) + + +def ml_select_kernel( + predictor: Predictor, + pool: List[KernelSpec], + M: int, + N: int, + K: int, + dtype: str, + layout: str, +) -> tuple: + """Score all kernels in the pool and return (best_spec, predicted_tflops).""" + problem = {"m": M, "n": N, "k": K, "dtype": dtype, "layout": layout, "split_k": 1} + kernel_dicts = [spec_to_feature_dict(s, dtype, layout) for s in pool] + + ranked = predictor.rank_kernels(problem, kernel_dicts) + if not ranked: + return pool[0], 0.0 + + best_name, best_tflops = ranked[0] + best_spec = next((s for s in pool if s.name == best_name), pool[0]) + return best_spec, best_tflops + + +def main(): + parser = argparse.ArgumentParser(description="ML-based kernel selection for GEMM") + parser.add_argument("--dtype", default="fp16", choices=["fp16", "bf16", "fp8"]) + parser.add_argument("--arch", default="gfx942") + parser.add_argument( + "--model_dir", + default=str( + Path(__file__).parent.parent.parent.parent + / "heuristics" + / "models" + / "gemm_universal_fp8_gfx950" + ), + ) + parser.add_argument( + "--no_run", action="store_true", help="Only predict, don't run GEMMs" + ) + args = parser.parse_args() + + print("=" * 75) + print(" Example 09: ML-Based Kernel Selection") + print("=" * 75) + print(f"\n Model: {args.model_dir}") + print(f" Dtype: {args.dtype}") + print(f" Arch: {args.arch}") + print(f" Pool: {len(KERNEL_POOL)} kernels") + + predictor = Predictor(args.model_dir) + print(" Model loaded successfully") + + np_dtype = np.float16 if args.dtype in ["fp16", "bf16"] else np.float16 + + test_sizes = [ + (128, 128, 64), + (256, 256, 128), + (512, 512, 256), + (1024, 1024, 512), + (2048, 2048, 1024), + ] + + header = f"{'Shape':<20} {'Selected Kernel':<25} {'Pred TFLOPS':>12}" + if not args.no_run: + header += f" {'Time (ms)':>10} {'TFLOPS':>10} {'Status':<8}" + print(f"\n {header}") + print(" " + "-" * len(header)) + + results = [] + + for M, N, K in test_sizes: + t0 = time.time() + best_spec, pred_tflops = ml_select_kernel( + predictor, KERNEL_POOL, M, N, K, args.dtype, "rcr" + ) + _ = (time.time() - t0) * 1000 # ML selection time (unused) + + size_str = f"{M}x{N}x{K}" + line = f" {size_str:<20} {best_spec.name:<25} {pred_tflops:>12.2f}" + + if args.no_run: + print(line) + results.append((size_str, best_spec.name, True, 0, pred_tflops)) + continue + + config = 
spec_to_kernel_config(best_spec, args.dtype, args.arch) + + setup = setup_gemm_dispatcher( + config=config, + registry_name=f"ml_{best_spec.name}", + verbose=False, + auto_rebuild=True, + ) + + if not setup.success: + line += f" {'N/A':>10} {'N/A':>10} {'BUILD':>8}" + print(line) + results.append((size_str, best_spec.name, False, 0, 0)) + cleanup_gemm() + continue + + dispatcher = setup.dispatcher + if not dispatcher.is_supported(M, N, K): + line += f" {'N/A':>10} {'N/A':>10} {'UNSUP':>8}" + print(line) + results.append((size_str, best_spec.name, False, 0, 0)) + cleanup_gemm() + continue + + np.random.seed(42) + A = (np.random.randn(M, K) * 0.1).astype(np_dtype) + B = (np.random.randn(K, N) * 0.1).astype(np_dtype) + + result = dispatcher.run(A, B, M, N, K) + + if result.success: + C_ref = np.matmul(A.astype(np.float32), B.astype(np.float32)).astype( + np_dtype + ) + max_err = np.max(np.abs(result.output - C_ref)) + passed = max_err < 1e-2 + status = "PASS" if passed else "FAIL" + line += f" {result.time_ms:>10.4f} {result.tflops:>10.2f} {status:<8}" + results.append( + (size_str, best_spec.name, passed, result.time_ms, result.tflops) + ) + else: + line += f" {'N/A':>10} {'N/A':>10} {'FAIL':<8}" + results.append((size_str, best_spec.name, False, 0, 0)) + + print(line) + cleanup_gemm() + + # Summary + print("\n" + "=" * 75) + print(" SUMMARY") + print("=" * 75) + passed = sum(1 for r in results if r[2]) + print(f"\n Results: {passed}/{len(results)} tests passed") + valid = [r for r in results if r[2] and r[4] > 0] + if valid: + avg = sum(r[4] for r in valid) / len(valid) + print(f" Average TFLOPS: {avg:.2f}") + if passed == len(results): + print("\n *** ALL TESTS PASSED ***") + print("=" * 75) + return 0 if passed == len(results) else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dispatcher/examples/gemm/python/09_multi_registry.py b/dispatcher/examples/gemm/python/09_multi_registry.py index 97cbce3497..5d9af239d4 100644 --- a/dispatcher/examples/gemm/python/09_multi_registry.py +++ b/dispatcher/examples/gemm/python/09_multi_registry.py @@ -8,7 +8,6 @@ Example 09: Multiple Registries Demonstrates multiple registries for different optimization targets. 
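For orientation, the multi-registry pattern exercised here keeps independently named kernel sets (for example, one tuned for latency and one for throughput) and builds a dispatcher per set. A rough sketch using the `setup_gemm_dispatcher`/`cleanup_gemm` helpers from this diff; the specific tile choices and the minimal `KernelConfig` field set are illustrative only:

```python
from ctypes_utils import (
    KernelConfig,
    setup_gemm_dispatcher,
    cleanup_gemm,
    detect_gpu_arch,
)

arch = detect_gpu_arch()
targets = {
    # small tiles: lower overhead for small problems
    "latency": KernelConfig(dtype_a="fp16", dtype_b="fp16", dtype_c="fp16",
                            tile_m=64, tile_n=64, tile_k=32, gfx_arch=arch),
    # large tiles: higher throughput for big problems
    "throughput": KernelConfig(dtype_a="fp16", dtype_b="fp16", dtype_c="fp16",
                               tile_m=256, tile_n=256, tile_k=32, gfx_arch=arch),
}

for name, cfg in targets.items():
    setup = setup_gemm_dispatcher(cfg, registry_name=name, verbose=False)
    if setup.success:
        print(f"registry '{name}' ready")
    cleanup_gemm()
```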
-Complexity: ★★★★★ Usage: python3 09_multi_registry.py @@ -30,6 +29,7 @@ from ctypes_utils import ( setup_gemm_dispatcher, cleanup_gemm, reset_for_example, + detect_gpu_arch, ) @@ -50,7 +50,9 @@ Examples: help="Data type (default: fp16)", ) parser.add_argument( - "--arch", default="gfx942", help="Target architecture (default: gfx942)" + "--arch", + default=detect_gpu_arch(), + help="Target architecture (auto-detected from rocminfo)", ) args = parser.parse_args() diff --git a/dispatcher/examples/gemm/python/10_advanced_benchmark.py b/dispatcher/examples/gemm/python/10_advanced_benchmark.py index e16e4e271f..b1462478d0 100644 --- a/dispatcher/examples/gemm/python/10_advanced_benchmark.py +++ b/dispatcher/examples/gemm/python/10_advanced_benchmark.py @@ -33,6 +33,7 @@ from ctypes_utils import ( setup_gemm_dispatcher, cleanup_gemm, reset_for_example, + detect_gpu_arch, ) @@ -69,7 +70,11 @@ def parse_args(): # Kernel configuration parser.add_argument("--dtype", default="fp16", help="Data type") parser.add_argument("--pipeline", default="compv4", help="Pipeline type") - parser.add_argument("--arch", default="gfx942", help="GPU architecture") + parser.add_argument( + "--arch", + default=detect_gpu_arch(), + help="GPU architecture (auto-detected from rocminfo)", + ) return parser.parse_args() diff --git a/dispatcher/examples/gemm/python/11_json_import.py b/dispatcher/examples/gemm/python/11_json_import.py index 06743af406..d19395e553 100644 --- a/dispatcher/examples/gemm/python/11_json_import.py +++ b/dispatcher/examples/gemm/python/11_json_import.py @@ -15,7 +15,6 @@ Key Features: - Use arch_filter validation on loaded configs - Export to C++ DECL_KERNEL_SET format -Complexity: ★★★☆☆ Usage: python3 11_json_import.py @@ -45,6 +44,7 @@ from ctypes_utils import ( # noqa: E402 cleanup_gemm, reset_for_example, validate_kernel_config, + detect_gpu_arch, ) # Sample JSON configuration (embedded for demonstration) @@ -141,8 +141,8 @@ Examples: ) parser.add_argument( "--arch", - default="gfx942", - help="Target GPU architecture (default: gfx942)", + default=detect_gpu_arch(), + help="Target GPU architecture (auto-detected from rocminfo, override with --arch gfxNNN)", ) args = parser.parse_args() @@ -236,13 +236,13 @@ Examples: else: invalid_count += 1 if invalid_count <= 3: # Show first 3 invalid - print(f"\n ✗ Invalid: {config.kernel_name()}") + print(f"\n FAIL Invalid: {config.kernel_name()}") for error in result.errors: print(f" Error: {error}") print("\n Validation Summary:") - print(f" ✓ Valid: {valid_count}") - print(f" ✗ Invalid: {invalid_count}") + print(f" OK Valid: {valid_count}") + print(f" FAIL Invalid: {invalid_count}") print(f" Total: {len(configs)}") # ========================================================================= @@ -275,12 +275,12 @@ Examples: disp_config, registry_name="json_import", verbose=False ) if setup.success: - print(" ✓ Dispatcher setup successful") + print(" OK Dispatcher setup successful") print( f" Kernel header: {setup.kernel_header.name if setup.kernel_header else 'N/A'}" ) else: - print(f" ⚠ Dispatcher setup: {setup.error}") + print(f" WARNING Dispatcher setup: {setup.error}") print(" (This is expected if kernels aren't generated)") # ========================================================================= diff --git a/dispatcher/examples/gemm/python/README.md b/dispatcher/examples/gemm/python/README.md index 0a83f3533f..07757b951b 100644 --- a/dispatcher/examples/gemm/python/README.md +++ b/dispatcher/examples/gemm/python/README.md @@ -295,5 +295,5 @@ 
Compilation time scales roughly linearly with kernel count. ## Related Documentation - [C++ GEMM Examples](../cpp/README.md) -- [Python Conv Examples](../../conv/python/README.md) +- [Python Utilities](../../../python/README.md) - [Main Dispatcher README](../../../README.md) diff --git a/dispatcher/examples/grouped_conv/cpp/01_basic_grouped_conv.cpp b/dispatcher/examples/grouped_conv/cpp/01_basic_grouped_conv.cpp new file mode 100644 index 0000000000..b503129c57 --- /dev/null +++ b/dispatcher/examples/grouped_conv/cpp/01_basic_grouped_conv.cpp @@ -0,0 +1,203 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +// Example 01: Basic Grouped Convolution +// +// Demonstrates three declaration patterns (mirrors GEMM 01): +// 1. AUTOFILL - tile + pipeline only, wave/warp auto-filled +// 2. AUTOCORRECT - invalid wave(1,1,1) corrected to valid config +// 3. FULL - all parameters explicit (matches validated gfx942 config) +// +// Then runs the forward convolution on GPU and verifies output. +// +// Build: cd dispatcher/build && cmake .. && make grouped_conv_01_basic + +#include <cstddef> +#include <iomanip> +#include <iostream> +#include <string> +#include <vector> + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" + +#include "ck_tile/dispatcher/grouped_conv_utils.hpp" +#include "ck_tile/dispatcher/example_args.hpp" + +using namespace ck_tile::dispatcher; +using namespace ck_tile::dispatcher::grouped_conv_utils; +using GroupedConvSig = grouped_conv_decl::GroupedConvSignature; +using GroupedConvAlgo = grouped_conv_decl::GroupedConvAlgorithm; + +using InDataType = ck_tile::half_t; +using WeiDataType = ck_tile::half_t; +using OutDataType = ck_tile::half_t; + +// Three declaration patterns -- codegen auto-fills/auto-corrects as needed +DECL_GROUPED_CONV_KERNEL_SET( + basic_conv_kernels, + // Pattern 1: AUTOFILL - only tile + pipeline, rest auto-filled + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("forward").dims(2), + GroupedConvAlgo().tile(1, 128, 128).pipeline("compv4").scheduler("intrawave"), + "gfx950") + // Pattern 2: AUTOCORRECT - wave(1,1,1) invalid, corrected to (1,4,1) + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("forward").dims(2), + GroupedConvAlgo() + .tile(1, 64, 64) + .wave(1, 1, 1) + .warp(16, 16, 32) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle") + .vector_sizes(4, 8, 8), + "gfx950") + // Pattern 3: FULL - all parameters explicit (validated config) + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("forward").dims(2), + GroupedConvAlgo() + .tile(1, 128, 128) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle") + .vector_sizes(4, 8, 8) + .block_per_cu(1), + "gfx950")); + +int main(int argc, char* argv[]) +{ + utils::ExampleArgs args("Example 01: Basic Grouped Convolution", + "Declaration patterns + GPU execution"); + args.add_option("--arch", "gfx950", "GPU architecture"); + args.add_option("--size", "14", "Spatial size (H=W)"); + args.add_option("-n", "1", "Batch size"); + args.add_option("-g", "1", "Groups"); + args.add_option("-c", "64", "Input channels C"); + args.add_option("-k", "128", "Output channels K"); + + if(!args.parse(argc, argv)) + return 0; + + utils::print_header("Example 01: Basic Grouped Convolution"); + + std::string gfx_arch = args.get("--arch", "gfx950"); + int N = args.get_int("-n", 1); + int G = args.get_int("-g", 1); + int
C = args.get_int("-c", 64); + int K = args.get_int("-k", 128); + int HW = args.get_int("--size", 14); + int Y = 3, X = 3; + + // Step 1: Show declared kernel sets + std::cout << "\nStep 1: Declared Kernel Sets\n"; + GroupedConvKernelSetRegistry::instance().print(); + + // Step 2: Register kernels + std::cout << "\nStep 2: Register Kernels\n"; + GroupedConvRegistry registry; + registry.set_name("basic_conv"); + REGISTER_GENERATED_KERNELS(registry, gfx_arch); + std::cout << " Registered " << registry.size() << " kernel(s)\n"; + + // Step 3: Create dispatcher + std::cout << "\nStep 3: Create Dispatcher\n"; + GroupedConvDispatcher dispatcher(®istry); + + // Step 4: Build problem using CK Tile ConvParam + std::cout << "\nStep 4: Problem\n"; + auto problem = create_grouped_conv2d_problem(N, C, K, HW, HW, Y, X, 1, 1); + problem.op = GroupedConvOp::Forward; + print_grouped_conv_problem(problem); + + ck_tile::conv::ConvParam conv_param{ + 2, + static_cast(G), + static_cast(N), + static_cast(K), + static_cast(C), + {static_cast(Y), static_cast(X)}, + {static_cast(HW), static_cast(HW)}, + {1, 1}, + {1, 1}, + {1, 1}, + {1, 1}}; + + using InLayout = ck_tile::tensor_layout::convolution::NHWGC; + using WeiLayout = ck_tile::tensor_layout::convolution::GKYXC; + using OutLayout = ck_tile::tensor_layout::convolution::NHWGK; + + auto in_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + auto wei_desc = + ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + auto out_desc = + ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + ck_tile::HostTensor input_host(in_desc); + ck_tile::HostTensor weight_host(wei_desc); + ck_tile::HostTensor output_host(out_desc); + + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(input_host); + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(weight_host); + + ck_tile::DeviceMem input_dev(input_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem weight_dev(weight_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem output_dev(output_host.get_element_space_size_in_bytes()); + + input_dev.ToDevice(input_host.data()); + weight_dev.ToDevice(weight_host.data()); + + // Step 5: Select and run + std::cout << "\nStep 5: Select and Run\n"; + + auto* selected = dispatcher.select_kernel(problem); + if(!selected) + { + std::cerr << " ERROR: No kernel found for problem!\n"; + return 1; + } + std::cout << " Selected: " << selected->name() << "\n"; + + float time_ms = dispatcher.run(input_dev.GetDeviceBuffer(), + weight_dev.GetDeviceBuffer(), + output_dev.GetDeviceBuffer(), + problem, + nullptr); + + double tflops = calculate_conv_tflops(problem, time_ms); + std::cout << " Time: " << std::fixed << std::setprecision(4) << time_ms << " ms\n"; + std::cout << " TFLOPS: " << std::setprecision(2) << tflops << "\n"; + + // Step 6: Verify + std::cout << "\nStep 6: Verify\n"; + output_dev.FromDevice(output_host.data()); + + size_t total = output_host.get_element_space_size(); + size_t nonzero = 0; + double checksum = 0.0; + for(size_t i = 0; i < total; ++i) + { + float v = static_cast(output_host.data()[i]); + if(v != 0.0f) + ++nonzero; + checksum += v; + } + + bool passed = nonzero > 0; + std::cout << " Output elements: " << total << "\n"; + std::cout << " Non-zero: " << nonzero << "/" << total + << (nonzero > 0 ? 
" (kernel produced output)" : " WARNING: all zeros!") << "\n"; + std::cout << " Checksum: " << std::fixed << std::setprecision(2) << checksum << "\n"; + std::cout << " Status: " << (passed ? "PASS" : "FAIL") << "\n"; + + utils::print_separator(); + std::cout << "DECLARATION PATTERNS:\n"; + std::cout << " 1. AUTOFILL: tile + pipeline only, wave/warp auto-filled\n"; + std::cout << " 2. AUTOCORRECT: invalid wave(1,1,1) corrected\n"; + std::cout << " 3. FULL: all parameters explicit\n"; + utils::print_separator(); + + return passed ? 0 : 1; +} diff --git a/dispatcher/examples/grouped_conv/cpp/02_all_directions.cpp b/dispatcher/examples/grouped_conv/cpp/02_all_directions.cpp new file mode 100644 index 0000000000..a2f2b9d560 --- /dev/null +++ b/dispatcher/examples/grouped_conv/cpp/02_all_directions.cpp @@ -0,0 +1,216 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +// Example 02: All Convolution Directions +// +// Forward, backward-data, and backward-weight for 2D convolution, +// each executed on GPU with non-zero verification. +// +// Build: cd dispatcher/build && cmake .. && make grouped_conv_02_all_dirs + +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" + +#include "ck_tile/dispatcher/grouped_conv_utils.hpp" +#include "ck_tile/dispatcher/example_args.hpp" + +using namespace ck_tile::dispatcher; +using namespace ck_tile::dispatcher::grouped_conv_utils; +using GroupedConvSig = grouped_conv_decl::GroupedConvSignature; +using GroupedConvAlgo = grouped_conv_decl::GroupedConvAlgorithm; + +using InDataType = ck_tile::half_t; +using WeiDataType = ck_tile::half_t; +using OutDataType = ck_tile::half_t; + +DECL_GROUPED_CONV_KERNEL_SET( + conv_fwd_2d, + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("forward").dims(2), + GroupedConvAlgo().tile(1, 128, 128).pipeline("compv4").vector_sizes(4, 8, 8), + "gfx950")); + +DECL_GROUPED_CONV_KERNEL_SET( + conv_bwdd_2d, + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("bwd_data").dims(2), + GroupedConvAlgo().tile(1, 128, 128).pipeline("compv3").vector_sizes(4, 8, 8), + "gfx950")); + +DECL_GROUPED_CONV_KERNEL_SET( + conv_bwdw_2d, + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("bwd_weight").dims(2), + GroupedConvAlgo() + .tile(1, 128, 128) + .pipeline("compv3") + .memory_op("atomic_add") + .vector_sizes(4, 8, 8), + "gfx950")); + +int main(int argc, char* argv[]) +{ + utils::ExampleArgs args("Example 02: All Convolution Directions", + "Forward/BwdData/BwdWeight with GPU execution and verification"); + args.add_option("--arch", "gfx950", "GPU architecture"); + + if(!args.parse(argc, argv)) + return 0; + + utils::print_header("Example 02: All Convolution Directions"); + + std::string gfx_arch = args.get("--arch", "gfx950"); + + GroupedConvRegistry registry; + registry.set_name("all_directions"); + REGISTER_GENERATED_KERNELS(registry, gfx_arch); + std::cout << " Registered " << registry.size() << " kernel(s)\n"; + + GroupedConvDispatcher dispatcher(®istry); + + const int N = 1, G = 1, C = 64, K = 128, Hi = 14, Wi = 14, Y = 3, X = 3; + + ck_tile::conv::ConvParam conv_param{ + 2, + static_cast(G), + static_cast(N), + static_cast(K), + static_cast(C), + {static_cast(Y), static_cast(X)}, + {static_cast(Hi), static_cast(Wi)}, + {1, 1}, + {1, 1}, + {1, 1}, + {1, 1}}; + + using InLayout = 
ck_tile::tensor_layout::convolution::NHWGC; + using WeiLayout = ck_tile::tensor_layout::convolution::GKYXC; + using OutLayout = ck_tile::tensor_layout::convolution::NHWGK; + + auto in_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + auto wei_desc = + ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + auto out_desc = + ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + ck_tile::HostTensor input(in_desc); + ck_tile::HostTensor weight(wei_desc); + ck_tile::HostTensor output(out_desc); + + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(input); + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(weight); + + ck_tile::DeviceMem input_dev(input.get_element_space_size_in_bytes()); + ck_tile::DeviceMem weight_dev(weight.get_element_space_size_in_bytes()); + ck_tile::DeviceMem output_dev(output.get_element_space_size_in_bytes()); + + input_dev.ToDevice(input.data()); + weight_dev.ToDevice(weight.data()); + + std::cout << "\n " << std::left << std::setw(12) << "Direction" << std::right << std::setw(10) + << "Time(ms)" << std::setw(10) << "TFLOPS" << std::setw(14) << "NonZero" + << std::setw(10) << "Status" << "\n"; + std::cout << " " << std::string(56, '-') << "\n"; + + bool all_pass = true; + + auto print_result = + [](const char* label, float time_ms, double tflops, size_t nz, size_t total, bool ok) { + std::cout << " " << std::left << std::setw(12) << label << std::right << std::fixed + << std::setprecision(4) << std::setw(10) << time_ms << std::setprecision(2) + << std::setw(10) << tflops << std::setw(14) + << (std::to_string(nz) + "/" + std::to_string(total)) << std::setw(10) + << (ok ? "OK" : "FAIL") << "\n"; + }; + + // Forward: run(X, W, Y) + { + auto problem = + create_grouped_conv2d_problem(N, C, K, Hi, Wi, Y, X, 1, 1, GroupedConvOp::Forward); + float time_ms = dispatcher.run(input_dev.GetDeviceBuffer(), + weight_dev.GetDeviceBuffer(), + output_dev.GetDeviceBuffer(), + problem, + nullptr); + output_dev.FromDevice(output.data()); + size_t nz = 0; + for(size_t i = 0; i < output.get_element_space_size(); ++i) + if(static_cast(output.data()[i]) != 0.0f) + ++nz; + bool ok = nz > 0; + print_result("forward", + time_ms, + calculate_conv_tflops(problem, time_ms), + nz, + output.get_element_space_size(), + ok); + if(!ok) + all_pass = false; + } + + // Backward Data: run(dY, W, dX) + { + auto problem = + create_grouped_conv2d_problem(N, C, K, Hi, Wi, Y, X, 1, 1, GroupedConvOp::BackwardData); + ck_tile::HostTensor dx_host(in_desc); + ck_tile::DeviceMem dx_dev(dx_host.get_element_space_size_in_bytes()); + float time_ms = dispatcher.run(output_dev.GetDeviceBuffer(), // dY (from forward pass) + weight_dev.GetDeviceBuffer(), // W + dx_dev.GetDeviceBuffer(), // dX (output) + problem, + nullptr); + dx_dev.FromDevice(dx_host.data()); + size_t nz = 0; + for(size_t i = 0; i < dx_host.get_element_space_size(); ++i) + if(static_cast(dx_host.data()[i]) != 0.0f) + ++nz; + bool ok = nz > 0; + print_result("bwd_data", + time_ms, + calculate_conv_tflops(problem, time_ms), + nz, + dx_host.get_element_space_size(), + ok); + if(!ok) + all_pass = false; + } + + // Backward Weight: run(X, dY, dW) + { + auto problem = create_grouped_conv2d_problem( + N, C, K, Hi, Wi, Y, X, 1, 1, GroupedConvOp::BackwardWeight); + ck_tile::HostTensor dw_host(wei_desc); + ck_tile::DeviceMem dw_dev(dw_host.get_element_space_size_in_bytes()); + float time_ms = dispatcher.run(input_dev.GetDeviceBuffer(), // X + output_dev.GetDeviceBuffer(), // dY + 
dw_dev.GetDeviceBuffer(), // dW (output) + problem, + nullptr); + dw_dev.FromDevice(dw_host.data()); + size_t nz = 0; + for(size_t i = 0; i < dw_host.get_element_space_size(); ++i) + if(static_cast(dw_host.data()[i]) != 0.0f) + ++nz; + bool ok = nz > 0; + print_result("bwd_weight", + time_ms, + calculate_conv_tflops(problem, time_ms), + nz, + dw_host.get_element_space_size(), + ok); + if(!ok) + all_pass = false; + } + + utils::print_separator(); + std::cout << " Status: " << (all_pass ? "PASS" : "FAIL") << "\n"; + utils::print_separator(); + + return all_pass ? 0 : 1; +} diff --git a/dispatcher/examples/grouped_conv/cpp/03_benchmark_validation.cpp b/dispatcher/examples/grouped_conv/cpp/03_benchmark_validation.cpp new file mode 100644 index 0000000000..12bd87d1a4 --- /dev/null +++ b/dispatcher/examples/grouped_conv/cpp/03_benchmark_validation.cpp @@ -0,0 +1,263 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +// Example 03: Benchmark and CPU-Reference Validation +// +// Runs a 2D grouped conv forward kernel on the GPU via dispatcher.run() +// and compares against the CK Tile host reference implementation. +// Exposes warmup/repeat/log_level as CLI args (matches example 20 pattern). +// +// Build: cd dispatcher/build && cmake .. && make grouped_conv_03_bench_val + +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" +#include "ck_tile/host/reference/reference_grouped_conv_fwd.hpp" + +#include "ck_tile/dispatcher/grouped_conv_utils.hpp" +#include "ck_tile/dispatcher/example_args.hpp" + +using namespace ck_tile::dispatcher; +using namespace ck_tile::dispatcher::grouped_conv_utils; +using GroupedConvSig = grouped_conv_decl::GroupedConvSignature; +using GroupedConvAlgo = grouped_conv_decl::GroupedConvAlgorithm; + +using InDataType = ck_tile::half_t; +using WeiDataType = ck_tile::half_t; +using OutDataType = ck_tile::half_t; +using AccDataType = float; + +DECL_GROUPED_CONV_KERNEL_SET( + bench_kernels, + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("forward").dims(2), + GroupedConvAlgo().tile(1, 128, 128).pipeline("compv4").vector_sizes(4, 8, 8), + "gfx950") + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("forward").dims(2), + GroupedConvAlgo().tile(1, 64, 64).pipeline("compv3").vector_sizes(4, 8, 8), + "gfx950")); + +int main(int argc, char* argv[]) +{ + utils::ExampleArgs args("Example 03: Benchmark & Validation", + "GPU execution with CPU reference validation"); + args.add_option("-n", "1", "Batch size N"); + args.add_option("-g", "1", "Groups G"); + args.add_option("-c", "64", "Input channels C"); + args.add_option("-k", "128", "Output channels K"); + args.add_option("--size", "14", "Spatial size (H=W)"); + args.add_option("--warmup", "3", "Warmup iterations"); + args.add_option("--repeat", "10", "Benchmark iterations"); + args.add_option("--arch", "gfx950", "GPU architecture"); + args.add_flag("--no-verify", "Skip CPU validation"); + + if(!args.parse(argc, argv)) + return 0; + + utils::print_header("Example 03: Grouped Conv Benchmark & Validation"); + + int N = args.get_int("-n", 1); + int G = args.get_int("-g", 1); + int C = args.get_int("-c", 64); + int K = args.get_int("-k", 128); + int Hi = args.get_int("--size", 14); + int Wi = Hi; + int Y = 3, X = 3; + int warmup = args.get_int("--warmup", 3); + int repeat = 
args.get_int("--repeat", 10); + bool verify = !args.has("--no-verify"); + std::string gfx_arch = args.get("--arch", "gfx950"); + + std::cout << "\nProblem: N=" << N << " G=" << G << " C=" << C << " K=" << K << " Hi=" << Hi + << " Wi=" << Wi << " Y=" << Y << " X=" << X << "\n"; + std::cout << "Benchmark: warmup=" << warmup << " repeat=" << repeat << "\n"; + + // Step 1: Setup tensors using CK Tile descriptors + std::cout << "\nStep 1: Setup tensors\n"; + + ck_tile::conv::ConvParam conv_param{ + 2, + static_cast(G), + static_cast(N), + static_cast(K), + static_cast(C), + {static_cast(Y), static_cast(X)}, + {static_cast(Hi), static_cast(Wi)}, + {1, 1}, + {1, 1}, + {1, 1}, + {1, 1}}; + + using InLayout = ck_tile::tensor_layout::convolution::NHWGC; + using WeiLayout = ck_tile::tensor_layout::convolution::GKYXC; + using OutLayout = ck_tile::tensor_layout::convolution::NHWGK; + + auto in_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + auto wei_desc = + ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + auto out_desc = + ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + ck_tile::HostTensor input(in_desc); + ck_tile::HostTensor weight(wei_desc); + ck_tile::HostTensor output_gpu(out_desc); + ck_tile::HostTensor output_cpu(out_desc); + + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(input); + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(weight); + output_cpu.SetZero(); + + std::cout << " Input: " << input.get_element_space_size() << " elements\n"; + std::cout << " Weight: " << weight.get_element_space_size() << " elements\n"; + std::cout << " Output: " << output_gpu.get_element_space_size() << " elements\n"; + + // Step 2: CPU reference + if(verify) + { + std::cout << "\nStep 2: CPU Reference\n"; + + std::vector strides_v = {1, 1}; + std::vector dilations_v = {1, 1}; + std::vector left_pads_v = {1, 1}; + std::vector right_pads_v = {1, 1}; + + ck_tile::reference_grouped_conv_fwd<2, InDataType, WeiDataType, OutDataType>( + input, weight, output_cpu, strides_v, dilations_v, left_pads_v, right_pads_v); + + std::cout << " CPU ref[0..7]: "; + for(int i = 0; i < std::min(8, static_cast(output_cpu.get_element_space_size())); ++i) + std::cout << std::fixed << std::setprecision(4) + << static_cast(output_cpu.data()[i]) << " "; + std::cout << "\n"; + } + + // Step 3: GPU execution via dispatcher + std::cout << "\nStep 3: GPU Execution (via dispatcher.run)\n"; + + GroupedConvRegistry registry; + registry.set_name("bench_val"); + REGISTER_GENERATED_KERNELS(registry, gfx_arch); + std::cout << " Registered " << registry.size() << " kernel(s)\n"; + + GroupedConvDispatcher dispatcher(®istry); + + auto problem = create_grouped_conv2d_problem(N, C, K, Hi, Wi, Y, X, 1, 1); + problem.op = GroupedConvOp::Forward; + + auto* selected = dispatcher.select_kernel(problem); + if(!selected) + { + std::cerr << " ERROR: No kernel found!\n"; + return 1; + } + std::cout << " Selected: " << selected->name() << "\n"; + + ck_tile::DeviceMem input_dev(input.get_element_space_size_in_bytes()); + ck_tile::DeviceMem weight_dev(weight.get_element_space_size_in_bytes()); + ck_tile::DeviceMem output_dev(output_gpu.get_element_space_size_in_bytes()); + + input_dev.ToDevice(input.data()); + weight_dev.ToDevice(weight.data()); + + float elapsed_ms = dispatcher.run(input_dev.GetDeviceBuffer(), + weight_dev.GetDeviceBuffer(), + output_dev.GetDeviceBuffer(), + problem, + nullptr); + + output_dev.FromDevice(output_gpu.data()); + + size_t total 
= output_gpu.get_element_space_size(); + std::cout << " GPU out[0..7]: "; + for(int i = 0; i < std::min(8, static_cast(total)); ++i) + std::cout << std::fixed << std::setprecision(4) << static_cast(output_gpu.data()[i]) + << " "; + std::cout << "\n"; + + size_t nonzero_gpu = 0; + double gpu_sum = 0.0; + for(size_t i = 0; i < total; ++i) + { + float v = static_cast(output_gpu.data()[i]); + if(v != 0.0f) + ++nonzero_gpu; + gpu_sum += v; + } + std::cout << " GPU checksum: " << std::fixed << std::setprecision(6) << gpu_sum << "\n"; + std::cout << " GPU non-zero: " << nonzero_gpu << "/" << total + << (nonzero_gpu > 0 ? " (kernel produced output)" : " WARNING: all zeros!") << "\n"; + + int Ho = static_cast(problem.Ho()); + int Wo = static_cast(problem.Wo()); + double flops = 2.0 * G * N * K * C * Y * X * Ho * Wo; + double tflops = flops / (elapsed_ms * 1e9); + + std::cout << " Time: " << std::fixed << std::setprecision(4) << elapsed_ms << " ms\n"; + std::cout << " TFLOPS: " << std::setprecision(2) << tflops << "\n"; + + // Step 4: Validation + bool passed = true; + if(verify) + { + std::cout << "\nStep 4: Validation (GPU vs CPU)\n"; + + constexpr float rtol = 1e-2f; + constexpr float atol = 1e-2f; + + float max_diff = 0.0f; + float max_rel = 0.0f; + size_t max_diff_idx = 0; + size_t num_elements = output_gpu.get_element_space_size(); + size_t mismatches = 0; + + for(size_t i = 0; i < num_elements; ++i) + { + float gpu_val = static_cast(output_gpu.data()[i]); + float cpu_val = static_cast(output_cpu.data()[i]); + float diff = std::abs(gpu_val - cpu_val); + float tol = atol + rtol * std::abs(cpu_val); + float rel = diff / (std::abs(cpu_val) + 1e-6f); + if(diff > max_diff) + { + max_diff = diff; + max_diff_idx = i; + } + max_rel = std::max(max_rel, rel); + if(diff > tol) + ++mismatches; + } + + passed = (mismatches == 0); + + std::cout << " Side-by-side at worst element [" << max_diff_idx << "]:\n"; + std::cout << " GPU: " << std::fixed << std::setprecision(6) + << static_cast(output_gpu.data()[max_diff_idx]) + << " CPU: " << static_cast(output_cpu.data()[max_diff_idx]) + << " diff: " << std::scientific << max_diff << "\n"; + std::cout << " Elements: " << num_elements << "\n"; + std::cout << " Mismatches: " << mismatches << "/" << num_elements << "\n"; + std::cout << " Max abs diff: " << std::scientific << max_diff << "\n"; + std::cout << " Max rel diff: " << std::scientific << max_rel << "\n"; + std::cout << " Status: " << (passed ? "PASSED" : "FAILED") << "\n"; + } + + utils::print_separator(); + std::cout << "BENCHMARK & VALIDATION:\n"; + std::cout << " GPU kernel: " << (selected ? selected->name() : "none") << "\n"; + std::cout << " Performance: " << std::fixed << std::setprecision(2) << tflops + << " TFLOPS\n"; + std::cout << " CPU reference: reference_grouped_conv_fwd<2>()\n"; + std::cout << " Validation: " << (passed ? "PASS" : "FAIL") << "\n"; + utils::print_separator(); + + return passed ? 0 : 1; +} diff --git a/dispatcher/examples/grouped_conv/cpp/04_registry_json.cpp b/dispatcher/examples/grouped_conv/cpp/04_registry_json.cpp new file mode 100644 index 0000000000..0e5a6d33be --- /dev/null +++ b/dispatcher/examples/grouped_conv/cpp/04_registry_json.cpp @@ -0,0 +1,154 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +// Example 04: Heuristic Selection + JSON Export +// +// Demonstrates runtime kernel selection with heuristic ranking, +// GPU execution, and JSON registry export. +// +// Build: cd dispatcher/build && cmake .. 
&& make grouped_conv_04_registry_json + +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" + +#include "ck_tile/dispatcher/grouped_conv_utils.hpp" +#include "ck_tile/dispatcher/example_args.hpp" + +using namespace ck_tile::dispatcher; +using namespace ck_tile::dispatcher::grouped_conv_utils; +using GroupedConvSig = grouped_conv_decl::GroupedConvSignature; +using GroupedConvAlgo = grouped_conv_decl::GroupedConvAlgorithm; + +using InDataType = ck_tile::half_t; +using WeiDataType = ck_tile::half_t; +using OutDataType = ck_tile::half_t; + +// Two tile configs for heuristic selection +DECL_GROUPED_CONV_KERNEL_SET( + heuristic_kernels, + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("forward").dims(2), + GroupedConvAlgo().tile(1, 128, 128).pipeline("compv4").vector_sizes(4, 8, 8), + "gfx950") + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("forward").dims(2), + GroupedConvAlgo().tile(1, 64, 64).pipeline("compv3").vector_sizes(4, 8, 8), + "gfx950")); + +std::vector conv_heuristic(const GroupedConvProblem& problem) +{ + int64_t spatial = problem.Ho() * problem.Wo(); + if(spatial > 400) + return {"128x128", "64x64"}; + return {"64x64", "128x128"}; +} + +int main(int argc, char* argv[]) +{ + utils::ExampleArgs args("Example 04: Heuristic + JSON", + "Runtime kernel selection and JSON export"); + args.add_option("--arch", "gfx950", "GPU architecture"); + + if(!args.parse(argc, argv)) + return 0; + + utils::print_header("Example 04: Heuristic Selection + JSON Export"); + + std::string gfx_arch = args.get("--arch", "gfx950"); + + // Step 1: Register + std::cout << "\nStep 1: Register Kernels" << std::endl; + GroupedConvRegistry registry; + registry.set_name("heuristic_conv"); + REGISTER_GENERATED_KERNELS(registry, gfx_arch); + std::cout << " Registered " << registry.size() << " kernel(s)" << std::endl; + + // Step 2: Heuristic dispatcher + std::cout << "\nStep 2: Heuristic Dispatcher" << std::endl; + GroupedConvDispatcher dispatcher(®istry); + dispatcher.set_strategy(GroupedConvDispatcher::SelectionStrategy::Heuristic); + dispatcher.set_heuristic(conv_heuristic); + + // Step 3: Select kernels (no GPU yet) + std::cout << "\nStep 3: Kernel Selection" << std::endl; + + auto problem = create_grouped_conv2d_problem(1, 64, 128, 14, 14, 3, 3, 1, 1); + + auto* selected = dispatcher.select_kernel(problem); + std::cout << " Selected: " << (selected ? selected->name() : "none") << std::endl; + + // Step 4: GPU execution + std::cout << "\nStep 4: GPU Execution" << std::endl; + + ck_tile::conv::ConvParam cp{ + 2, + static_cast(1), + static_cast(1), + static_cast(128), + static_cast(64), + {static_cast(3), static_cast(3)}, + {static_cast(14), static_cast(14)}, + {1, 1}, + {1, 1}, + {1, 1}, + {1, 1}}; + + using InLayout = ck_tile::tensor_layout::convolution::NHWGC; + using WeiLayout = ck_tile::tensor_layout::convolution::GKYXC; + using OutLayout = ck_tile::tensor_layout::convolution::NHWGK; + + std::cout << " Creating tensors..." 
<< std::endl; + auto in_d = ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(cp); + auto wei_d = ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(cp); + auto out_d = ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(cp); + + ck_tile::HostTensor input(in_d); + ck_tile::HostTensor weight(wei_d); + ck_tile::HostTensor output(out_d); + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(input); + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(weight); + + std::cout << " Allocating device memory..." << std::endl; + ck_tile::DeviceMem in_dev(input.get_element_space_size_in_bytes()); + ck_tile::DeviceMem wei_dev(weight.get_element_space_size_in_bytes()); + ck_tile::DeviceMem out_dev(output.get_element_space_size_in_bytes()); + in_dev.ToDevice(input.data()); + wei_dev.ToDevice(weight.data()); + + std::cout << " Launching kernel..." << std::endl; + float time_ms = dispatcher.run(in_dev.GetDeviceBuffer(), + wei_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + problem, + nullptr); + + std::cout << " Reading back..." << std::endl; + out_dev.FromDevice(output.data()); + size_t nz = 0; + for(size_t i = 0; i < output.get_element_space_size(); ++i) + if(static_cast(output.data()[i]) != 0.0f) + ++nz; + + std::cout << " Time: " << std::fixed << std::setprecision(4) << time_ms << " ms" + << std::endl; + std::cout << " TFLOPS: " << std::setprecision(2) << calculate_conv_tflops(problem, time_ms) + << std::endl; + std::cout << " NonZero: " << nz << "/" << output.get_element_space_size() << std::endl; + + // Step 5: JSON export + std::cout << "\nStep 5: JSON Export" << std::endl; + std::string json = registry.export_json(false); + std::cout << " JSON size: " << json.size() << " bytes" << std::endl; + + bool passed = nz > 0; + utils::print_separator(); + std::cout << " Status: " << (passed ? "PASS" : "FAIL") << "\n"; + utils::print_separator(); + + return passed ? 0 : 1; +} diff --git a/dispatcher/examples/grouped_conv/cpp/05_bwd_data.cpp b/dispatcher/examples/grouped_conv/cpp/05_bwd_data.cpp new file mode 100644 index 0000000000..35595bb14c --- /dev/null +++ b/dispatcher/examples/grouped_conv/cpp/05_bwd_data.cpp @@ -0,0 +1,183 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +// Example 05: Backward Data with CPU Reference Validation +// +// Computes dX = ConvBwdData(dY, W) on GPU via dispatcher.run() +// and validates against ck_tile::reference_grouped_conv_bwd_data. +// +// Build: cd dispatcher/build && cmake .. 
&& make grouped_conv_05_bwd_data + +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" +#include "ck_tile/host/reference/reference_grouped_conv_bwd_data.hpp" + +#include "ck_tile/dispatcher/grouped_conv_utils.hpp" +#include "ck_tile/dispatcher/example_args.hpp" + +using namespace ck_tile::dispatcher; +using namespace ck_tile::dispatcher::grouped_conv_utils; +using GroupedConvSig = grouped_conv_decl::GroupedConvSignature; +using GroupedConvAlgo = grouped_conv_decl::GroupedConvAlgorithm; + +using InDataType = ck_tile::half_t; +using WeiDataType = ck_tile::half_t; +using OutDataType = ck_tile::half_t; + +DECL_GROUPED_CONV_KERNEL_SET( + bwd_data_kernels, + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("bwd_data").dims(2), + GroupedConvAlgo() + .tile(1, 128, 128) + .pipeline("compv3") + .scheduler("intrawave") + .vector_sizes(4, 8, 8), + "gfx950")); + +int main(int argc, char* argv[]) +{ + utils::ExampleArgs args("Example 05: Backward Data Validation", + "dX = ConvBwdData(dY, W) with CPU reference"); + args.add_option("--arch", "gfx950", "GPU architecture"); + args.add_option("-n", "1", "Batch size"); + args.add_option("-c", "64", "Input channels"); + args.add_option("-k", "128", "Output channels"); + args.add_option("--size", "14", "Spatial size (H=W)"); + + if(!args.parse(argc, argv)) + return 0; + + utils::print_header("Example 05: Backward Data with CPU Validation"); + + std::string gfx_arch = args.get("--arch", "gfx950"); + int N = args.get_int("-n", 1), G = 1; + int C = args.get_int("-c", 64), K = args.get_int("-k", 128); + int Hi = args.get_int("--size", 14), Wi = Hi, Y = 3, X = 3; + + // Setup + ck_tile::conv::ConvParam conv_param{ + 2, + static_cast(G), + static_cast(N), + static_cast(K), + static_cast(C), + {static_cast(Y), static_cast(X)}, + {static_cast(Hi), static_cast(Wi)}, + {1, 1}, + {1, 1}, + {1, 1}, + {1, 1}}; + + using InLayout = ck_tile::tensor_layout::convolution::NHWGC; + using WeiLayout = ck_tile::tensor_layout::convolution::GKYXC; + using OutLayout = ck_tile::tensor_layout::convolution::NHWGK; + + auto in_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + auto wei_desc = + ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + auto out_desc = + ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + // dY (gradient from next layer) and W (weight) are inputs; dX is output + ck_tile::HostTensor dy(out_desc); + ck_tile::HostTensor weight(wei_desc); + ck_tile::HostTensor dx_gpu(in_desc); + ck_tile::HostTensor dx_cpu(in_desc); + + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(dy); + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(weight); + dx_cpu.SetZero(); + + // CPU reference + std::cout << "\nStep 1: CPU Reference (bwd_data)\n"; + std::vector strides_v = {1, 1}; + std::vector dilations_v = {1, 1}; + std::vector left_pads_v = {1, 1}; + std::vector right_pads_v = {1, 1}; + + ck_tile::reference_grouped_conv_bwd_data<2, InDataType, WeiDataType, OutDataType>( + dx_cpu, weight, dy, strides_v, dilations_v, left_pads_v, right_pads_v); + std::cout << " CPU complete\n"; + + // GPU execution via dispatcher + std::cout << "\nStep 2: GPU Execution (via dispatcher.run)\n"; + + GroupedConvRegistry registry; + registry.set_name("bwd_data"); + REGISTER_GENERATED_KERNELS(registry, gfx_arch); + + GroupedConvDispatcher 
dispatcher(®istry); + + auto problem = + create_grouped_conv2d_problem(N, C, K, Hi, Wi, Y, X, 1, 1, GroupedConvOp::BackwardData); + + auto* selected = dispatcher.select_kernel(problem); + if(!selected) + { + std::cerr << " ERROR: No bwd_data kernel found!\n"; + return 1; + } + std::cout << " Selected: " << selected->name() << "\n"; + + ck_tile::DeviceMem dy_dev(dy.get_element_space_size_in_bytes()); + ck_tile::DeviceMem wei_dev(weight.get_element_space_size_in_bytes()); + ck_tile::DeviceMem dx_dev(dx_gpu.get_element_space_size_in_bytes()); + + dy_dev.ToDevice(dy.data()); + wei_dev.ToDevice(weight.data()); + + // dispatcher.run(dY, W, dX, problem) for bwd_data + float time_ms = dispatcher.run(dy_dev.GetDeviceBuffer(), + wei_dev.GetDeviceBuffer(), + dx_dev.GetDeviceBuffer(), + problem, + nullptr); + + dx_dev.FromDevice(dx_gpu.data()); + + double tflops = (time_ms > 0) ? calculate_conv_tflops(problem, time_ms) : 0; + std::cout << " Time: " << std::fixed << std::setprecision(4) << time_ms << " ms\n"; + std::cout << " TFLOPS: " << std::setprecision(2) << tflops << "\n"; + + // Validation + std::cout << "\nStep 3: Validation (GPU vs CPU)\n"; + + size_t num_elements = dx_gpu.get_element_space_size(); + float max_abs = 0, max_rel = 0; + size_t mismatches = 0; + constexpr float rtol = 5e-2f, atol = 5e-2f; + + for(size_t i = 0; i < num_elements; ++i) + { + float gv = static_cast(dx_gpu.data()[i]); + float cv = static_cast(dx_cpu.data()[i]); + float d = std::abs(gv - cv); + float r = d / (std::abs(cv) + 1e-6f); + max_abs = std::max(max_abs, d); + max_rel = std::max(max_rel, r); + if(d > atol + rtol * std::abs(cv)) + ++mismatches; + } + + bool passed = (mismatches == 0); + std::cout << " Elements: " << num_elements << "\n"; + std::cout << " Mismatches: " << mismatches << "\n"; + std::cout << " Max abs diff: " << std::scientific << max_abs << "\n"; + std::cout << " Max rel diff: " << std::scientific << max_rel << "\n"; + + utils::print_separator(); + std::cout << " dX = ConvBwdData(dY, W)\n"; + std::cout << " Status: " << (passed ? "PASS" : "FAIL") << "\n"; + utils::print_separator(); + + return passed ? 0 : 1; +} diff --git a/dispatcher/examples/grouped_conv/cpp/06_bwd_weight.cpp b/dispatcher/examples/grouped_conv/cpp/06_bwd_weight.cpp new file mode 100644 index 0000000000..41cb75aecf --- /dev/null +++ b/dispatcher/examples/grouped_conv/cpp/06_bwd_weight.cpp @@ -0,0 +1,188 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +// Example 06: Backward Weight with CPU Reference Validation +// +// Computes dW = ConvBwdWeight(X, dY) on GPU via dispatcher.run() +// and validates against ck_tile::reference_grouped_conv_bwd_weight. +// +// Build: cd dispatcher/build && cmake .. 
&& make grouped_conv_06_bwd_weight + +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" +#include "ck_tile/host/reference/reference_grouped_conv_bwd_weight.hpp" + +#include "ck_tile/dispatcher/grouped_conv_utils.hpp" +#include "ck_tile/dispatcher/example_args.hpp" + +using namespace ck_tile::dispatcher; +using namespace ck_tile::dispatcher::grouped_conv_utils; +using GroupedConvSig = grouped_conv_decl::GroupedConvSignature; +using GroupedConvAlgo = grouped_conv_decl::GroupedConvAlgorithm; + +using InDataType = ck_tile::half_t; +using WeiDataType = ck_tile::half_t; +using OutDataType = ck_tile::half_t; + +DECL_GROUPED_CONV_KERNEL_SET( + bwd_weight_kernels, + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("bwd_weight").dims(2), + GroupedConvAlgo() + .tile(1, 128, 128) + .pipeline("compv3") + .scheduler("intrawave") + .memory_op("atomic_add") + .vector_sizes(4, 8, 8), + "gfx950")); + +int main(int argc, char* argv[]) +{ + utils::ExampleArgs args("Example 06: Backward Weight Validation", + "dW = ConvBwdWeight(X, dY) with CPU reference"); + args.add_option("--arch", "gfx950", "GPU architecture"); + args.add_option("-n", "1", "Batch size"); + args.add_option("-c", "64", "Input channels"); + args.add_option("-k", "128", "Output channels"); + args.add_option("--size", "14", "Spatial size (H=W)"); + args.add_option("--split-k", "1", "Split-K factor for bwd_weight (k_batch)"); + + if(!args.parse(argc, argv)) + return 0; + + utils::print_header("Example 06: Backward Weight with CPU Validation"); + + std::string gfx_arch = args.get("--arch", "gfx950"); + int N = args.get_int("-n", 1), G = 1; + int C = args.get_int("-c", 64), K = args.get_int("-k", 128); + int Hi = args.get_int("--size", 14), Wi = Hi, Y = 3, X = 3; + + // Setup + ck_tile::conv::ConvParam conv_param{ + 2, + static_cast(G), + static_cast(N), + static_cast(K), + static_cast(C), + {static_cast(Y), static_cast(X)}, + {static_cast(Hi), static_cast(Wi)}, + {1, 1}, + {1, 1}, + {1, 1}, + {1, 1}}; + + using InLayout = ck_tile::tensor_layout::convolution::NHWGC; + using WeiLayout = ck_tile::tensor_layout::convolution::GKYXC; + using OutLayout = ck_tile::tensor_layout::convolution::NHWGK; + + auto in_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + auto wei_desc = + ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + auto out_desc = + ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + // X (input) and dY (gradient) are inputs; dW is output + ck_tile::HostTensor input(in_desc); + ck_tile::HostTensor dy(out_desc); + ck_tile::HostTensor dw_gpu(wei_desc); + ck_tile::HostTensor dw_cpu(wei_desc); + + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(input); + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(dy); + dw_cpu.SetZero(); + + // CPU reference + std::cout << "\nStep 1: CPU Reference (bwd_weight)\n"; + std::vector strides_v = {1, 1}; + std::vector dilations_v = {1, 1}; + std::vector left_pads_v = {1, 1}; + std::vector right_pads_v = {1, 1}; + + ck_tile::reference_grouped_conv_bwd_weight<2, InDataType, WeiDataType, OutDataType>( + input, dw_cpu, dy, strides_v, dilations_v, left_pads_v, right_pads_v); + std::cout << " CPU complete\n"; + + // GPU execution + std::cout << "\nStep 2: GPU Execution (via dispatcher.run)\n"; + + GroupedConvRegistry registry; + 
registry.set_name("bwd_weight"); + REGISTER_GENERATED_KERNELS(registry, gfx_arch); + + GroupedConvDispatcher dispatcher(®istry); + + auto problem = + create_grouped_conv2d_problem(N, C, K, Hi, Wi, Y, X, 1, 1, GroupedConvOp::BackwardWeight); + problem.split_k = args.get_int("--split-k", 1); + + auto* selected = dispatcher.select_kernel(problem); + if(!selected) + { + std::cerr << " ERROR: No bwd_weight kernel found!\n"; + return 1; + } + std::cout << " Selected: " << selected->name() << "\n"; + + ck_tile::DeviceMem in_dev(input.get_element_space_size_in_bytes()); + ck_tile::DeviceMem dy_dev(dy.get_element_space_size_in_bytes()); + ck_tile::DeviceMem dw_dev(dw_gpu.get_element_space_size_in_bytes()); + + in_dev.ToDevice(input.data()); + dy_dev.ToDevice(dy.data()); + if(problem.split_k > 1) + dw_dev.SetZero(); + + // dispatcher.run(X, dY, dW, problem) for bwd_weight + float time_ms = dispatcher.run(in_dev.GetDeviceBuffer(), + dy_dev.GetDeviceBuffer(), + dw_dev.GetDeviceBuffer(), + problem, + nullptr); + + dw_dev.FromDevice(dw_gpu.data()); + + double tflops = (time_ms > 0) ? calculate_conv_tflops(problem, time_ms) : 0; + std::cout << " Time: " << std::fixed << std::setprecision(4) << time_ms << " ms\n"; + std::cout << " TFLOPS: " << std::setprecision(2) << tflops << "\n"; + + // Validation + std::cout << "\nStep 3: Validation (GPU vs CPU)\n"; + + size_t num_elements = dw_gpu.get_element_space_size(); + float max_abs = 0, max_rel = 0; + size_t mismatches = 0; + constexpr float rtol = 5e-2f, atol = 5e-2f; + + for(size_t i = 0; i < num_elements; ++i) + { + float gv = static_cast(dw_gpu.data()[i]); + float cv = static_cast(dw_cpu.data()[i]); + float d = std::abs(gv - cv); + float r = d / (std::abs(cv) + 1e-6f); + max_abs = std::max(max_abs, d); + max_rel = std::max(max_rel, r); + if(d > atol + rtol * std::abs(cv)) + ++mismatches; + } + + bool passed = (mismatches == 0); + std::cout << " Elements: " << num_elements << "\n"; + std::cout << " Mismatches: " << mismatches << "\n"; + std::cout << " Max abs diff: " << std::scientific << max_abs << "\n"; + std::cout << " Max rel diff: " << std::scientific << max_rel << "\n"; + + utils::print_separator(); + std::cout << " dW = ConvBwdWeight(X, dY)\n"; + std::cout << " Status: " << (passed ? "PASS" : "FAIL") << "\n"; + utils::print_separator(); + + return passed ? 0 : 1; +} diff --git a/dispatcher/examples/grouped_conv/cpp/07_multi_tile_benchmark.cpp b/dispatcher/examples/grouped_conv/cpp/07_multi_tile_benchmark.cpp new file mode 100644 index 0000000000..5c95f2c45a --- /dev/null +++ b/dispatcher/examples/grouped_conv/cpp/07_multi_tile_benchmark.cpp @@ -0,0 +1,226 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +// Example 07: Multi-Tile Benchmark +// +// Benchmarks multiple tile configurations across ResNet-like problem sizes. +// Exposes warmup, repeat, and init method as CLI args (matching CK Tile +// example 20 patterns). +// +// Build: cd dispatcher/build && cmake .. 
&& make grouped_conv_07_benchmark + +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" + +#include "ck_tile/dispatcher/grouped_conv_utils.hpp" +#include "ck_tile/dispatcher/example_args.hpp" + +using namespace ck_tile::dispatcher; +using namespace ck_tile::dispatcher::grouped_conv_utils; +using GroupedConvSig = grouped_conv_decl::GroupedConvSignature; +using GroupedConvAlgo = grouped_conv_decl::GroupedConvAlgorithm; + +using InDataType = ck_tile::half_t; +using WeiDataType = ck_tile::half_t; +using OutDataType = ck_tile::half_t; + +// Multiple tile configurations for benchmarking +DECL_GROUPED_CONV_KERNEL_SET( + benchmark_tiles, + // Small tile - compv3 + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("forward").dims(2), + GroupedConvAlgo() + .tile(1, 64, 64) + .wave(1, 4, 1) + .warp(16, 16, 32) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle") + .vector_sizes(4, 8, 8) + .block_per_cu(1), + "gfx950") + // Medium tile - compv3 + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("forward").dims(2), + GroupedConvAlgo() + .tile(1, 128, 128) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle") + .vector_sizes(4, 8, 8) + .block_per_cu(1), + "gfx950") + // Large tile - compv4 with double smem buffer + .add(GroupedConvSig().dtype("fp16").layout("nhwgc").conv_type("forward").dims(2), + GroupedConvAlgo() + .tile(1, 256, 256) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv4") + .scheduler("intrawave") + .epilogue("cshuffle") + .vector_sizes(4, 8, 8) + .block_per_cu(1), + "gfx950")); + +int main(int argc, char* argv[]) +{ + utils::ExampleArgs args("Example 07: Multi-Tile Benchmark", + "Multiple tiles across ResNet-like problem sizes"); + args.add_option("--arch", "gfx950", "GPU architecture"); + args.add_option("--warmup", "5", "Warmup iterations (passed to stream_config)"); + args.add_option("--repeat", "20", "Benchmark iterations (passed to stream_config)"); + args.add_option("--init", "0", "Init method: 0=random, 1=linear, 2=constant(1)"); + + if(!args.parse(argc, argv)) + return 0; + + utils::print_header("Example 07: Multi-Tile Benchmark"); + + std::string gfx_arch = args.get("--arch", "gfx950"); + int warmup = args.get_int("--warmup", 5); + int repeat = args.get_int("--repeat", 20); + int init_method = args.get_int("--init", 0); + + std::cout << "\n Config: warmup=" << warmup << " repeat=" << repeat << " init=" << init_method + << "\n"; + + GroupedConvRegistry registry; + registry.set_name("benchmark"); + REGISTER_GENERATED_KERNELS(registry, gfx_arch); + std::cout << " Registered " << registry.size() << " kernel(s)\n"; + + GroupedConvDispatcher dispatcher(®istry); + + // ResNet-like problem sizes + struct BenchProblem + { + const char* label; + int N, C, K, Hi, Wi, Y, X; + }; + + BenchProblem problems[] = { + {"ResNet-stage2", 1, 64, 64, 56, 56, 3, 3}, + {"ResNet-stage3", 1, 128, 128, 28, 28, 3, 3}, + {"ResNet-stage4", 1, 256, 256, 14, 14, 3, 3}, + {"ResNet-stage5", 1, 512, 512, 7, 7, 3, 3}, + {"Pointwise-1x1", 1, 256, 256, 56, 56, 1, 1}, + {"Batch-8", 8, 64, 128, 56, 56, 3, 3}, + }; + + std::cout << "\n " << std::left << std::setw(16) << "Problem" << std::right << std::setw(5) + << "N" << std::setw(5) << "C" << std::setw(5) << "K" << std::setw(5) << "H" + << std::setw(5) << "W" << std::setw(4) << "F" << std::setw(10) << 
"Time(ms)" + << std::setw(10) << "TFLOPS" << std::setw(10) << "Status" << "\n"; + std::cout << " " << std::string(74, '-') << "\n"; + + bool all_pass = true; + for(const auto& bp : problems) + { + auto problem = + create_grouped_conv2d_problem(bp.N, bp.C, bp.K, bp.Hi, bp.Wi, bp.Y, bp.X, 1, 1); + problem.op = GroupedConvOp::Forward; + + ck_tile::conv::ConvParam conv_param{ + 2, + static_cast(1), + static_cast(bp.N), + static_cast(bp.K), + static_cast(bp.C), + {static_cast(bp.Y), static_cast(bp.X)}, + {static_cast(bp.Hi), static_cast(bp.Wi)}, + {1, 1}, + {1, 1}, + {1, 1}, + {1, 1}}; + + using InLayout = ck_tile::tensor_layout::convolution::NHWGC; + using WeiLayout = ck_tile::tensor_layout::convolution::GKYXC; + using OutLayout = ck_tile::tensor_layout::convolution::NHWGK; + + auto in_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + auto wei_desc = + ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed( + conv_param); + auto out_desc = + ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv_param); + + ck_tile::HostTensor input(in_desc); + ck_tile::HostTensor weight(wei_desc); + ck_tile::HostTensor output(out_desc); + + switch(init_method) + { + case 1: + ck_tile::FillMonotonicSeq{0.0f, 0.001f}(input); + ck_tile::FillMonotonicSeq{0.0f, 0.001f}(weight); + break; + case 2: + ck_tile::FillConstant{1.0f}(input); + ck_tile::FillConstant{1.0f}(weight); + break; + default: + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(input); + ck_tile::FillUniformDistribution{-0.5f, 0.5f}(weight); + break; + } + ck_tile::DeviceMem in_dev(input.get_element_space_size_in_bytes()); + ck_tile::DeviceMem wei_dev(weight.get_element_space_size_in_bytes()); + ck_tile::DeviceMem out_dev(output.get_element_space_size_in_bytes()); + + in_dev.ToDevice(input.data()); + wei_dev.ToDevice(weight.data()); + + float time_ms = 0; + bool ok = false; + try + { + time_ms = dispatcher.run(in_dev.GetDeviceBuffer(), + wei_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + problem, + nullptr); + + out_dev.FromDevice(output.data()); + size_t nz = 0; + for(size_t j = 0; j < output.get_element_space_size(); ++j) + if(static_cast(output.data()[j]) != 0.0f) + ++nz; + ok = nz > 0; + } + catch(const std::exception&) + { + ok = false; + } + + double tflops = (time_ms > 0) ? calculate_conv_tflops(problem, time_ms) : 0; + + std::string filter_str = std::to_string(bp.Y) + "x" + std::to_string(bp.X); + std::cout << " " << std::left << std::setw(16) << bp.label << std::right << std::setw(5) + << bp.N << std::setw(5) << bp.C << std::setw(5) << bp.K << std::setw(5) << bp.Hi + << std::setw(5) << bp.Wi << std::setw(4) << filter_str << std::fixed + << std::setprecision(4) << std::setw(10) << time_ms << std::setprecision(2) + << std::setw(10) << tflops << std::setw(10) << (ok ? "OK" : "FAIL") << "\n"; + if(!ok) + all_pass = false; + } + + utils::print_separator(); + std::cout << " Warmup: " << warmup << ", Repeat: " << repeat << ", Init: " << init_method + << "\n"; + std::cout << " Status: " << (all_pass ? "PASS" : "FAIL") << "\n"; + utils::print_separator(); + + return all_pass ? 0 : 1; +} diff --git a/dispatcher/examples/grouped_conv/python/01_basic_grouped_conv.py b/dispatcher/examples/grouped_conv/python/01_basic_grouped_conv.py new file mode 100644 index 0000000000..46f57b3879 --- /dev/null +++ b/dispatcher/examples/grouped_conv/python/01_basic_grouped_conv.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+# SPDX-License-Identifier: MIT + +""" +Example 01: Basic Grouped Convolution + +Demonstrates: +1. Three kernel configuration patterns (minimal, explicit, full ConvConfigBase) +2. Adding kernels to a registry +3. Validation and auto-correction +4. JIT compilation via registry.build() +5. GPU execution with CPU reference verification + +Usage: + python3 01_basic_grouped_conv.py + python3 01_basic_grouped_conv.py --variant bwd_data + python3 01_basic_grouped_conv.py --arch gfx942 +""" + +import sys +import argparse +import time +import numpy as np +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python")) + +from grouped_conv_utils import ( + GroupedConvKernelConfig, + GroupedConvProblem, + GroupedConvRegistry, + validate_grouped_conv_config, + auto_correct_grouped_conv_config, + detect_gpu_arch, +) + + +def cpu_conv2d_fwd(inp, wei, prob): + """Naive CPU reference: 2D forward, NHWGC layout.""" + N, Hi, Wi, G, Cpg = inp.shape + _, Kpg, Y, X, _ = wei.shape + Ho, Wo = prob.Ho, prob.Wo + out = np.zeros((N, Ho, Wo, G, Kpg), dtype=np.float32) + for n in range(N): + for g in range(G): + for ho in range(Ho): + for wo in range(Wo): + for k in range(Kpg): + s = 0.0 + for y in range(Y): + for x in range(X): + hi = ( + ho * prob.stride_h + - prob.pad_h + + y * prob.dilation_h + ) + wi = ( + wo * prob.stride_w + - prob.pad_w + + x * prob.dilation_w + ) + if 0 <= hi < Hi and 0 <= wi < Wi: + for c in range(Cpg): + s += float(inp[n, hi, wi, g, c]) * float( + wei[g, k, y, x, c] + ) + out[n, ho, wo, g, k] = s + return out + + +def main(): + parser = argparse.ArgumentParser(description="Basic Grouped Conv Example") + parser.add_argument("--dtype", default="fp16", choices=["fp16", "bf16"]) + parser.add_argument( + "--variant", default="forward", choices=["forward", "bwd_data", "bwd_weight"] + ) + parser.add_argument("--ndim", type=int, default=2, choices=[2, 3]) + parser.add_argument("--arch", default=detect_gpu_arch()) + parser.add_argument( + "--workers", type=int, default=0, help="Max JIT workers (0=auto)" + ) + args = parser.parse_args() + + print("=" * 70) + print("Example 01: Basic Grouped Convolution") + print("=" * 70) + + # ========================================================================= + # Step 1: Three kernel configuration patterns + # ========================================================================= + print("\n--- Step 1: Kernel Configuration Patterns ---") + + # Pattern 1: MINIMAL -- only variant/dtype/arch, everything else auto-filled + config_minimal = GroupedConvKernelConfig( + variant=args.variant, + ndim_spatial=args.ndim, + arch=args.arch, + dtype=args.dtype, + ) + print("\n Pattern 1: MINIMAL (defaults auto-filled)") + config_minimal.print_config(indent=" ") + + # Pattern 2: EXPLICIT tile/wave/warp -- user controls tiling strategy + config_explicit = GroupedConvKernelConfig( + variant=args.variant, + ndim_spatial=args.ndim, + arch=args.arch, + dtype=args.dtype, + tile_m=1, + tile_n=64, + tile_k=64, + wave_m=1, + wave_n=4, + wave_k=1, + warp_tile_m=16, + warp_tile_n=16, + warp_tile_k=32, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + ) + print("\n Pattern 2: EXPLICIT tile/wave/warp") + config_explicit.print_config(indent=" ") + + # Pattern 3: FULL ConvConfigBase -- every parameter specified + config_full = GroupedConvKernelConfig( + variant=args.variant, + ndim_spatial=args.ndim, + arch=args.arch, + dtype=args.dtype, + tile_m=1, + tile_n=128, + tile_k=128, + wave_m=2, + wave_n=2, + wave_k=1, + 
warp_tile_m=32, + warp_tile_n=32, + warp_tile_k=16, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + num_wave_groups=1, + num_groups_to_merge=1, + ) + print("\n Pattern 3: FULL (all ConvConfigBase fields)") + config_full.print_config(indent=" ") + + # ========================================================================= + # Step 2: Build a registry with multiple configs + # ========================================================================= + print("\n--- Step 2: Build Registry ---") + registry = GroupedConvRegistry("basic_conv") + registry.add(config_minimal) + registry.add(config_explicit) + registry.add(config_full) + registry.print_registry() + + # ========================================================================= + # Step 3: Validate and auto-correct + # ========================================================================= + print("\n--- Step 3: Validate & Auto-Correct ---") + for i, cfg in enumerate(registry.kernels): + result = validate_grouped_conv_config(cfg.to_dict()) + if result.is_valid: + print(f" Config [{i}] {cfg.tile_str}: VALID") + else: + print(f" Config [{i}] {cfg.tile_str}: needs correction") + corrected, result = auto_correct_grouped_conv_config(cfg.to_dict()) + print(f" After correction: valid={result.is_valid}") + + # ========================================================================= + # Step 4: JIT compile via registry.build() + # ========================================================================= + print("\n--- Step 4: JIT Build (via registry.build()) ---") + + # Use only the first config for the actual GPU run + jit_reg = GroupedConvRegistry("jit") + jit_reg.add(config_minimal) + + workers = args.workers if args.workers > 0 else None + t0 = time.perf_counter() + runners = jit_reg.build(verbose=False, max_workers=workers) + jit_build_s = time.perf_counter() - t0 + + key = (args.variant, args.ndim) + if key not in runners: + print(" JIT build failed") + return 1 + runner = runners[key] + print(f" JIT build: {jit_build_s:.3f} s") + print(f" Library: {runner.library_path}") + print(f" Kernels: {runner.lib.kernel_names()}") + + # ========================================================================= + # Step 5: Define problem + GPU execution + # ========================================================================= + print("\n--- Step 5: GPU Execution ---") + prob = GroupedConvProblem( + N=1, + C=64, + K=128, + Hi=16, + Wi=16, + Y=3, + X=3, + stride_h=1, + stride_w=1, + pad_h=1, + pad_w=1, + direction=args.variant, + ) + prob.print_problem() + + inp = np.random.uniform(-0.5, 0.5, prob.input_shape()).astype(np.float16) + wei = np.random.uniform(-0.5, 0.5, prob.weight_shape()).astype(np.float16) + + res = runner.run(inp, wei, prob) + if not res.success: + print(f" GPU execution failed: {res.error}") + runner.cleanup() + return 1 + + print(f" Time: {res.time_ms:.4f} ms") + print(f" TFLOPS: {res.tflops:.2f}") + print( + f" Output: shape={res.output.shape}, range=[{res.output.min():.3f}, {res.output.max():.3f}]" + ) + + # ========================================================================= + # Step 6: CPU reference (forward 2D only) + # ========================================================================= + verified = False + if args.variant == "forward" and args.ndim == 2: + print("\n--- Step 6: CPU Reference Verification ---") + ref = cpu_conv2d_fwd(inp, wei, prob) + gpu_f32 = res.output.astype(np.float32) + diff = 
np.abs(gpu_f32 - ref) + max_abs = diff.max() + max_rel = (diff / (np.abs(ref) + 1e-6)).max() + match = np.allclose(gpu_f32, ref, atol=0.05, rtol=0.05) + print(f" max_abs_diff: {max_abs:.6f}") + print(f" max_rel_diff: {max_rel:.6f}") + print(f" Match: {match}") + verified = match + + runner.cleanup() + + # Summary + print("\n" + "=" * 70) + status = ( + "PASS" if res.success and (verified or args.variant != "forward") else "FAIL" + ) + print(f" Status: {status}") + print( + f" {config_minimal.name} | {prob.gflops:.2f} GFLOPs | {res.tflops:.2f} TFLOPS" + ) + print(f" JIT build time: {jit_build_s:.3f} s") + print(f" Registry: {len(registry)} configs (3 patterns demonstrated)") + print("=" * 70) + return 0 if status == "PASS" else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dispatcher/examples/grouped_conv/python/02_forward.py b/dispatcher/examples/grouped_conv/python/02_forward.py new file mode 100644 index 0000000000..8f59db05a1 --- /dev/null +++ b/dispatcher/examples/grouped_conv/python/02_forward.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Example 02: Forward Convolution (2D + 3D) + +Declares forward kernels with explicit tile/wave/warp/pipeline parameters, +builds a registry, JIT compiles, runs on GPU, and validates against CPU reference. + +Usage: + python3 02_forward.py + python3 02_forward.py --arch gfx942 +""" + +import sys +import argparse +import time +import numpy as np +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python")) + +from grouped_conv_utils import ( + GroupedConvKernelConfig, + GroupedConvProblem, + GroupedConvRegistry, + detect_gpu_arch, +) + + +def cpu_conv2d_fwd(inp, wei, prob): + """Naive CPU reference: 2D forward, NHWGC layout.""" + N, Hi, Wi, G, C = inp.shape + _, Kpg, Y, X, _ = wei.shape + Ho, Wo = prob.Ho, prob.Wo + out = np.zeros((N, Ho, Wo, G, Kpg), dtype=np.float32) + for n in range(N): + for g in range(G): + for ho in range(Ho): + for wo in range(Wo): + for k in range(Kpg): + s = 0.0 + for y in range(Y): + for x in range(X): + hi = ho * prob.stride_h - prob.pad_h + y + wi = wo * prob.stride_w - prob.pad_w + x + if 0 <= hi < Hi and 0 <= wi < Wi: + for c in range(C): + s += float(inp[n, hi, wi, g, c]) * float( + wei[g, k, y, x, c] + ) + out[n, ho, wo, g, k] = s + return out + + +def main(): + parser = argparse.ArgumentParser(description="Forward Convolution (2D + 3D)") + parser.add_argument("--arch", default=detect_gpu_arch()) + parser.add_argument("--dtype", default="fp16", choices=["fp16", "bf16"]) + parser.add_argument( + "--workers", type=int, default=0, help="Max JIT workers (0=auto)" + ) + args = parser.parse_args() + + arch = args.arch + print("=" * 70) + print("Example 02: Forward Convolution (2D + 3D)") + print("=" * 70) + print(f"\n Arch: {arch}, Dtype: {args.dtype}") + + # ========================================================================= + # Step 1: Declare forward kernels with explicit parameters + # ========================================================================= + print("\n--- Step 1: Declare Forward Kernels ---") + reg = GroupedConvRegistry("forward_conv") + + # Forward 2D: compv4, 128x128 tile, wave 2x2x1, warp 32x32x16 + reg.add( + GroupedConvKernelConfig( + variant="forward", + ndim_spatial=2, + arch=arch, + dtype=args.dtype, + tile_m=1, + tile_n=128, + tile_k=128, + wave_m=2, + wave_n=2, + wave_k=1, + warp_tile_m=32, + warp_tile_n=32, + 
warp_tile_k=16, + pipeline="compv4", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + ) + ) + # Forward 3D: compv3, 64x64 tile, wave 1x4x1, warp 16x16x32 + reg.add( + GroupedConvKernelConfig( + variant="forward", + ndim_spatial=3, + arch=arch, + dtype=args.dtype, + tile_m=1, + tile_n=64, + tile_k=64, + wave_m=1, + wave_n=4, + wave_k=1, + warp_tile_m=16, + warp_tile_n=16, + warp_tile_k=32, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + ) + ) + reg.print_registry() + + # ========================================================================= + # Step 2: JIT build via registry + # ========================================================================= + print("\n--- Step 2: JIT Build ---") + workers = args.workers if args.workers > 0 else None + t0 = time.perf_counter() + runners = reg.build(verbose=False, max_workers=workers) + jit_s = time.perf_counter() - t0 + print(f" Built {len(runners)} runners in {jit_s:.1f}s") + + for key in [("forward", 2), ("forward", 3)]: + tag = "OK" if key in runners else "FAILED" + print(f" {key[0]} {key[1]}D: {tag}") + + if ("forward", 2) not in runners: + print(" ERROR: forward 2D JIT failed") + return 1 + + np_dtype = np.float16 if args.dtype in ["fp16", "bf16"] else np.float32 + + # ========================================================================= + # Step 3: Forward 2D -- GPU + CPU reference + # ========================================================================= + print("\n--- Step 3: Forward 2D ---") + prob_2d = GroupedConvProblem( + N=1, C=64, K=64, Hi=8, Wi=8, Y=3, X=3, pad_h=1, pad_w=1, direction="forward" + ) + prob_2d.print_problem() + + x = np.random.uniform(-0.5, 0.5, prob_2d.input_shape()).astype(np_dtype) + w = np.random.uniform(-0.5, 0.5, prob_2d.weight_shape()).astype(np_dtype) + + res = runners[("forward", 2)].run(x, w, prob_2d) + print(f" Time: {res.time_ms:.4f} ms") + print(f" TFLOPS: {res.tflops:.2f}") + print( + f" Output: shape={res.output.shape}, nonzero={np.count_nonzero(res.output)}/{res.output.size}" + ) + + ref = cpu_conv2d_fwd(x, w, prob_2d) + diff = np.abs(res.output.astype(np.float32) - ref) + match_2d = np.allclose(res.output.astype(np.float32), ref, atol=0.05) + print(f" CPU ref: max_abs={diff.max():.6f}, match={match_2d}") + + # ========================================================================= + # Step 4: Forward 3D -- GPU + non-zero check + # ========================================================================= + ok_3d = True + if ("forward", 3) in runners: + print("\n--- Step 4: Forward 3D ---") + prob_3d = GroupedConvProblem( + N=1, + C=64, + K=64, + Di=8, + Hi=8, + Wi=8, + Z=3, + Y=3, + X=3, + pad_d=1, + pad_h=1, + pad_w=1, + direction="forward", + ) + prob_3d.print_problem() + + x3 = np.random.uniform(-0.5, 0.5, prob_3d.input_shape()).astype(np_dtype) + w3 = np.random.uniform(-0.5, 0.5, prob_3d.weight_shape()).astype(np_dtype) + + res3 = runners[("forward", 3)].run(x3, w3, prob_3d) + nz = np.count_nonzero(res3.output) + ok_3d = res3.success and nz > 0 + print(f" Time: {res3.time_ms:.4f} ms") + print(f" TFLOPS: {res3.tflops:.2f}") + print(f" NonZero: {nz}/{res3.output.size}") + + for r in runners.values(): + r.cleanup() + + passed = res.success and match_2d and ok_3d + print("\n" + "=" * 70) + print(f" Forward 2D: {'PASS' if match_2d else 'FAIL'} (CPU validated)") + print(f" Forward 3D: {'PASS' if ok_3d else 'FAIL'} 
(non-zero check)") + print(f" JIT build: {jit_s:.1f}s") + print(f" Status: {'PASS' if passed else 'FAIL'}") + print("=" * 70) + return 0 if passed else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dispatcher/examples/grouped_conv/python/03_bwd_data.py b/dispatcher/examples/grouped_conv/python/03_bwd_data.py new file mode 100644 index 0000000000..a000ba7c96 --- /dev/null +++ b/dispatcher/examples/grouped_conv/python/03_bwd_data.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Example 03: Backward Data Convolution (2D + 3D) + +dX = ConvBwdData(dY, W) + +Declares backward-data kernels with explicit parameters, +builds a registry, JIT compiles, runs on GPU, and validates +against a CPU reference. + +Usage: + python3 03_bwd_data.py +""" + +import sys +import argparse +import time +import numpy as np +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python")) + +from grouped_conv_utils import ( + GroupedConvKernelConfig, + GroupedConvProblem, + GroupedConvRegistry, + detect_gpu_arch, +) + + +def cpu_conv2d_bwd_data(dy, wei, prob): + """CPU ref: compute dX from dY and W.""" + N, Ho, Wo, G, Kpg = dy.shape + _, _, Y, X, C = wei.shape + Hi, Wi = prob.Hi, prob.Wi + dx = np.zeros((N, Hi, Wi, G, C), dtype=np.float32) + for n in range(N): + for g in range(G): + for hi in range(Hi): + for wi in range(Wi): + for c in range(C): + s = 0.0 + for y in range(Y): + for x in range(X): + ho = hi + prob.pad_h - y + wo = wi + prob.pad_w - x + if ho % prob.stride_h == 0 and wo % prob.stride_w == 0: + ho //= prob.stride_h + wo //= prob.stride_w + if 0 <= ho < Ho and 0 <= wo < Wo: + for k in range(Kpg): + s += float(dy[n, ho, wo, g, k]) * float( + wei[g, k, y, x, c] + ) + dx[n, hi, wi, g, c] = s + return dx + + +def main(): + parser = argparse.ArgumentParser(description="Backward Data (2D + 3D)") + parser.add_argument("--arch", default=detect_gpu_arch()) + parser.add_argument("--dtype", default="fp16", choices=["fp16", "bf16"]) + parser.add_argument("--workers", type=int, default=0) + args = parser.parse_args() + + arch = args.arch + print("=" * 70) + print("Example 03: Backward Data Convolution (2D + 3D)") + print("=" * 70) + print(f"\n Arch: {arch}, Dtype: {args.dtype}") + print(" dX = ConvBwdData(dY, W)") + + # ========================================================================= + # Step 1: Declare bwd_data kernels + # ========================================================================= + print("\n--- Step 1: Declare BwdData Kernels ---") + reg = GroupedConvRegistry("bwd_data_conv") + + # BwdData 2D: compv3, 128x128 tile + reg.add( + GroupedConvKernelConfig( + variant="bwd_data", + ndim_spatial=2, + arch=arch, + dtype=args.dtype, + tile_m=1, + tile_n=128, + tile_k=128, + wave_m=2, + wave_n=2, + wave_k=1, + warp_tile_m=32, + warp_tile_n=32, + warp_tile_k=16, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + ) + ) + # BwdData 3D: compv3, 64x64 tile + reg.add( + GroupedConvKernelConfig( + variant="bwd_data", + ndim_spatial=3, + arch=arch, + dtype=args.dtype, + tile_m=1, + tile_n=64, + tile_k=64, + wave_m=1, + wave_n=4, + wave_k=1, + warp_tile_m=16, + warp_tile_n=16, + warp_tile_k=32, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + ) + ) + 
reg.print_registry() + + # ========================================================================= + # Step 2: JIT build + # ========================================================================= + print("\n--- Step 2: JIT Build ---") + workers = args.workers if args.workers > 0 else None + t0 = time.perf_counter() + runners = reg.build(verbose=False, max_workers=workers) + jit_s = time.perf_counter() - t0 + print(f" Built {len(runners)} runners in {jit_s:.1f}s") + + if ("bwd_data", 2) not in runners: + print(" ERROR: bwd_data 2D JIT failed") + return 1 + + np_dtype = np.float16 if args.dtype in ["fp16", "bf16"] else np.float32 + + # ========================================================================= + # Step 3: BwdData 2D -- GPU + CPU reference + # ========================================================================= + print("\n--- Step 3: Backward Data 2D ---") + prob = GroupedConvProblem( + N=1, C=32, K=32, Hi=8, Wi=8, Y=3, X=3, pad_h=1, pad_w=1, direction="bwd_data" + ) + prob.print_problem() + + dy = np.random.uniform(-0.5, 0.5, prob.output_shape()).astype(np_dtype) + w = np.random.uniform(-0.5, 0.5, prob.weight_shape()).astype(np_dtype) + + res = runners[("bwd_data", 2)].run(dy, w, prob) + print(f" Time: {res.time_ms:.4f} ms") + print(f" TFLOPS: {res.tflops:.2f}") + print(f" NonZero: {np.count_nonzero(res.output)}/{res.output.size}") + + ref = cpu_conv2d_bwd_data(dy, w, prob) + diff = np.abs(res.output.astype(np.float32) - ref) + match_2d = np.allclose(res.output.astype(np.float32), ref, atol=0.1) + print(f" CPU ref: max_abs={diff.max():.6f}, match={match_2d}") + + # ========================================================================= + # Step 4: BwdData 3D -- GPU + non-zero check + # ========================================================================= + ok_3d = True + if ("bwd_data", 3) in runners: + print("\n--- Step 4: Backward Data 3D ---") + prob3 = GroupedConvProblem( + N=1, + C=32, + K=32, + Di=6, + Hi=6, + Wi=6, + Z=3, + Y=3, + X=3, + pad_d=1, + pad_h=1, + pad_w=1, + direction="bwd_data", + ) + dy3 = np.random.uniform(-0.5, 0.5, prob3.output_shape()).astype(np_dtype) + w3 = np.random.uniform(-0.5, 0.5, prob3.weight_shape()).astype(np_dtype) + res3 = runners[("bwd_data", 3)].run(dy3, w3, prob3) + nz = np.count_nonzero(res3.output) + ok_3d = res3.success and nz > 0 + print(f" Time: {res3.time_ms:.4f} ms, NonZero: {nz}/{res3.output.size}") + + for r in runners.values(): + r.cleanup() + + passed = res.success and match_2d and ok_3d + print("\n" + "=" * 70) + print(f" BwdData 2D: {'PASS' if match_2d else 'FAIL'} (CPU validated)") + print(f" BwdData 3D: {'PASS' if ok_3d else 'FAIL'}") + print(f" Status: {'PASS' if passed else 'FAIL'}") + print("=" * 70) + return 0 if passed else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dispatcher/examples/grouped_conv/python/04_bwd_weight.py b/dispatcher/examples/grouped_conv/python/04_bwd_weight.py new file mode 100644 index 0000000000..48e50cd4a9 --- /dev/null +++ b/dispatcher/examples/grouped_conv/python/04_bwd_weight.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Example 04: Backward Weight Convolution (2D + 3D) + +dW = ConvBwdWeight(X, dY) + +Declares backward-weight kernels with explicit parameters, +builds a registry, JIT compiles, runs on GPU, and validates +against a CPU reference. 
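
The CPU reference computes, for every filter tap,
dW[g, k, y, x, c] = sum over (n, ho, wo) of
X[n, ho*stride_h - pad_h + y, wo*stride_w - pad_w + x, g, c] * dY[n, ho, wo, g, k],
skipping taps that fall outside the (padded) input.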
+ +Usage: + python3 04_bwd_weight.py +""" + +import sys +import argparse +import time +import numpy as np +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python")) + +from grouped_conv_utils import ( + GroupedConvKernelConfig, + GroupedConvProblem, + GroupedConvRegistry, + detect_gpu_arch, +) + + +def cpu_conv2d_bwd_weight(x, dy, prob): + """CPU ref: compute dW from X and dY.""" + N, Hi, Wi, G, C = x.shape + _, Ho, Wo, _, Kpg = dy.shape + Y, X_ = prob.Y, prob.X + dw = np.zeros((G, Kpg, Y, X_, C), dtype=np.float32) + for g in range(G): + for k in range(Kpg): + for y in range(Y): + for xf in range(X_): + for c in range(C): + s = 0.0 + for n in range(N): + for ho in range(Ho): + for wo in range(Wo): + hi = ho * prob.stride_h - prob.pad_h + y + wi = wo * prob.stride_w - prob.pad_w + xf + if 0 <= hi < Hi and 0 <= wi < Wi: + s += float(x[n, hi, wi, g, c]) * float( + dy[n, ho, wo, g, k] + ) + dw[g, k, y, xf, c] = s + return dw + + +def main(): + parser = argparse.ArgumentParser(description="Backward Weight (2D + 3D)") + parser.add_argument("--arch", default=detect_gpu_arch()) + parser.add_argument("--dtype", default="fp16", choices=["fp16", "bf16"]) + parser.add_argument("--workers", type=int, default=0) + parser.add_argument( + "--split-k", type=int, default=1, help="Split-K factor for bwd_weight (k_batch)" + ) + args = parser.parse_args() + + arch = args.arch + print("=" * 70) + print("Example 04: Backward Weight Convolution (2D + 3D)") + print("=" * 70) + print(f"\n Arch: {arch}, Dtype: {args.dtype}") + print(" dW = ConvBwdWeight(X, dY)") + + # ========================================================================= + # Step 1: Declare bwd_weight kernels + # ========================================================================= + print("\n--- Step 1: Declare BwdWeight Kernels ---") + reg = GroupedConvRegistry("bwd_weight_conv") + + # BwdWeight 2D: compv3, 128x128 tile + reg.add( + GroupedConvKernelConfig( + variant="bwd_weight", + ndim_spatial=2, + arch=arch, + dtype=args.dtype, + tile_m=1, + tile_n=128, + tile_k=128, + wave_m=2, + wave_n=2, + wave_k=1, + warp_tile_m=32, + warp_tile_n=32, + warp_tile_k=16, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + ) + ) + # BwdWeight 3D: compv3, 64x64 tile + reg.add( + GroupedConvKernelConfig( + variant="bwd_weight", + ndim_spatial=3, + arch=arch, + dtype=args.dtype, + tile_m=1, + tile_n=64, + tile_k=64, + wave_m=1, + wave_n=4, + wave_k=1, + warp_tile_m=16, + warp_tile_n=16, + warp_tile_k=32, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + ) + ) + reg.print_registry() + + # ========================================================================= + # Step 2: JIT build + # ========================================================================= + print("\n--- Step 2: JIT Build ---") + workers = args.workers if args.workers > 0 else None + t0 = time.perf_counter() + runners = reg.build(verbose=False, max_workers=workers) + jit_s = time.perf_counter() - t0 + print(f" Built {len(runners)} runners in {jit_s:.1f}s") + + if ("bwd_weight", 2) not in runners: + print(" ERROR: bwd_weight 2D JIT failed") + return 1 + + np_dtype = np.float16 if args.dtype in ["fp16", "bf16"] else np.float32 + + # ========================================================================= + # Step 3: BwdWeight 2D -- GPU + CPU reference + # 
========================================================================= + print("\n--- Step 3: Backward Weight 2D ---") + prob = GroupedConvProblem( + N=1, + C=32, + K=32, + Hi=8, + Wi=8, + Y=3, + X=3, + pad_h=1, + pad_w=1, + direction="bwd_weight", + split_k=args.split_k, + ) + prob.print_problem() + + x = np.random.uniform(-0.5, 0.5, prob.input_shape()).astype(np_dtype) + dy = np.random.uniform(-0.5, 0.5, prob.output_shape()).astype(np_dtype) + + res = runners[("bwd_weight", 2)].run(x, dy, prob) + print(f" Time: {res.time_ms:.4f} ms") + print(f" TFLOPS: {res.tflops:.2f}") + print(f" NonZero: {np.count_nonzero(res.output)}/{res.output.size}") + + ref = cpu_conv2d_bwd_weight(x, dy, prob) + diff = np.abs(res.output.astype(np.float32) - ref) + match_2d = np.allclose(res.output.astype(np.float32), ref, atol=0.5) + print(f" CPU ref: max_abs={diff.max():.6f}, match={match_2d}") + + # ========================================================================= + # Step 4: BwdWeight 3D -- GPU + non-zero check + # ========================================================================= + ok_3d = True + if ("bwd_weight", 3) in runners: + print("\n--- Step 4: Backward Weight 3D ---") + prob3 = GroupedConvProblem( + N=1, + C=32, + K=32, + Di=6, + Hi=6, + Wi=6, + Z=3, + Y=3, + X=3, + pad_d=1, + pad_h=1, + pad_w=1, + direction="bwd_weight", + ) + x3 = np.random.uniform(-0.5, 0.5, prob3.input_shape()).astype(np_dtype) + dy3 = np.random.uniform(-0.5, 0.5, prob3.output_shape()).astype(np_dtype) + res3 = runners[("bwd_weight", 3)].run(x3, dy3, prob3) + nz = np.count_nonzero(res3.output) + ok_3d = res3.success and nz > 0 + print(f" Time: {res3.time_ms:.4f} ms, NonZero: {nz}/{res3.output.size}") + + for r in runners.values(): + r.cleanup() + + passed = res.success and match_2d and ok_3d + print("\n" + "=" * 70) + print(f" BwdWeight 2D: {'PASS' if match_2d else 'FAIL'} (CPU validated)") + print(f" BwdWeight 3D: {'PASS' if ok_3d else 'FAIL'}") + print(f" Status: {'PASS' if passed else 'FAIL'}") + print("=" * 70) + return 0 if passed else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dispatcher/examples/grouped_conv/python/05_benchmark.py b/dispatcher/examples/grouped_conv/python/05_benchmark.py new file mode 100644 index 0000000000..9166ab988e --- /dev/null +++ b/dispatcher/examples/grouped_conv/python/05_benchmark.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Example 05: Multi-Problem GPU Benchmark + +Declares kernels with explicit tile/wave/warp/pipeline parameters for +all directions, builds registries, JIT compiles, and benchmarks across +ResNet-like problem sizes with configurable warmup/repeat. 
+ +Usage: + python3 05_benchmark.py + python3 05_benchmark.py --warmup 3 --repeat 10 + python3 05_benchmark.py --workers 4 +""" + +import sys +import argparse +import time +import numpy as np +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python")) + +from grouped_conv_utils import ( + GroupedConvKernelConfig, + GroupedConvProblem, + GroupedConvRegistry, + detect_gpu_arch, +) + + +def compute_bytes(prob, dtype_bytes=2): + in_elems = 1 + for d in prob.input_shape(): + in_elems *= d + wei_elems = 1 + for d in prob.weight_shape(): + wei_elems *= d + out_elems = 1 + for d in prob.output_shape(): + out_elems *= d + return (in_elems + wei_elems + out_elems) * dtype_bytes + + +def main(): + parser = argparse.ArgumentParser(description="Multi-Problem GPU Benchmark") + parser.add_argument("--arch", default=detect_gpu_arch()) + parser.add_argument("--dtype", default="fp16", choices=["fp16", "bf16"]) + parser.add_argument("--warmup", type=int, default=3, help="Warmup iterations") + parser.add_argument("--repeat", type=int, default=5, help="Benchmark iterations") + parser.add_argument( + "--workers", type=int, default=0, help="Max JIT workers (0=auto)" + ) + args = parser.parse_args() + + print("=" * 70) + print("Example 05: Multi-Problem GPU Benchmark") + print("=" * 70) + print(f"\n Arch: {args.arch}, Dtype: {args.dtype}") + print(f" Warmup: {args.warmup}, Repeat: {args.repeat}") + + # ========================================================================= + # Step 1: Declare all kernels with explicit parameters + # ========================================================================= + print("\n--- Step 1: Declare Kernels ---") + reg = GroupedConvRegistry("benchmark") + + # Forward 2D: compv4, 128x128 tile + reg.add( + GroupedConvKernelConfig( + variant="forward", + ndim_spatial=2, + arch=args.arch, + dtype=args.dtype, + tile_m=1, + tile_n=128, + tile_k=128, + wave_m=2, + wave_n=2, + wave_k=1, + warp_tile_m=32, + warp_tile_n=32, + warp_tile_k=16, + pipeline="compv4", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + ) + ) + # Forward 3D: compv3, 64x64 tile + reg.add( + GroupedConvKernelConfig( + variant="forward", + ndim_spatial=3, + arch=args.arch, + dtype=args.dtype, + tile_m=1, + tile_n=64, + tile_k=64, + wave_m=1, + wave_n=4, + wave_k=1, + warp_tile_m=16, + warp_tile_n=16, + warp_tile_k=32, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + ) + ) + # BwdData 2D: compv3, 128x128 tile + reg.add( + GroupedConvKernelConfig( + variant="bwd_data", + ndim_spatial=2, + arch=args.arch, + dtype=args.dtype, + tile_m=1, + tile_n=128, + tile_k=128, + wave_m=2, + wave_n=2, + wave_k=1, + warp_tile_m=32, + warp_tile_n=32, + warp_tile_k=16, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + ) + ) + # BwdWeight 2D: compv3, 128x128 tile + reg.add( + GroupedConvKernelConfig( + variant="bwd_weight", + ndim_spatial=2, + arch=args.arch, + dtype=args.dtype, + tile_m=1, + tile_n=128, + tile_k=128, + wave_m=2, + wave_n=2, + wave_k=1, + warp_tile_m=32, + warp_tile_n=32, + warp_tile_k=16, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + ) + ) + reg.print_registry() + + # 
========================================================================= + # Step 2: JIT build + # ========================================================================= + print("\n--- Step 2: JIT Build ---") + workers = args.workers if args.workers > 0 else None + t0 = time.perf_counter() + runner_by_key = reg.build(verbose=False, max_workers=workers) + jit_s = time.perf_counter() - t0 + + for key in [("forward", 2), ("forward", 3), ("bwd_data", 2), ("bwd_weight", 2)]: + tag = "OK" if key in runner_by_key else "FAILED" + print(f" {key[0]:12s} {key[1]}D: {tag}") + print(f" JIT build time: {jit_s:.3f} s") + + missing = [ + k + for k in [("forward", 2), ("forward", 3), ("bwd_data", 2), ("bwd_weight", 2)] + if k not in runner_by_key + ] + if missing: + print(f"\n ERROR: missing {missing}") + return 1 + + np_dtype = np.float16 if args.dtype in ["fp16", "bf16"] else np.float32 + + def bench_run(runner, inp, wei, prob): + for _ in range(args.warmup): + runner.run(inp, wei, prob) + times = [] + for _ in range(args.repeat): + r = runner.run(inp, wei, prob) + if r.success: + times.append(r.time_ms) + if not times: + return 0.0, 0.0 + return min(times), sum(times) / len(times) + + # ========================================================================= + # Step 3: 2D Forward benchmark + # ========================================================================= + print("\n--- Step 3: Forward 2D Benchmark ---") + print( + f"{'Problem':<18} {'N':>3} {'C':>4} {'K':>4} {'H':>3} {'W':>3} " + f"{'F':>3} {'Min(ms)':>9} {'Avg(ms)':>9} {'TFLOPS':>8} {'GB/s':>8}" + ) + print("-" * 85) + + all_ok = True + for label, n, c, k, h, w, y, x, s, p in [ + ("ResNet-stage2", 1, 64, 64, 56, 56, 3, 3, 1, 1), + ("ResNet-stage3", 1, 128, 128, 28, 28, 3, 3, 1, 1), + ("ResNet-stage4", 1, 256, 256, 14, 14, 3, 3, 1, 1), + ("ResNet-stage5", 1, 512, 512, 7, 7, 3, 3, 1, 1), + ("Pointwise-1x1", 1, 256, 256, 56, 56, 1, 1, 1, 0), + ("Batch-8", 8, 64, 128, 56, 56, 3, 3, 1, 1), + ("Batch-32", 32, 64, 128, 56, 56, 3, 3, 1, 1), + ]: + prob = GroupedConvProblem( + N=n, + C=c, + K=k, + Hi=h, + Wi=w, + Y=y, + X=x, + stride_h=s, + stride_w=s, + pad_h=p, + pad_w=p, + direction="forward", + ) + inp = np.random.uniform(-0.3, 0.3, prob.input_shape()).astype(np_dtype) + wei = np.random.uniform(-0.3, 0.3, prob.weight_shape()).astype(np_dtype) + min_ms, avg_ms = bench_run(runner_by_key[("forward", 2)], inp, wei, prob) + if avg_ms > 0: + tflops = prob.flops / (avg_ms * 1e9) + bw = compute_bytes(prob) / (avg_ms * 1e6) + print( + f"{label:<18} {n:>3} {c:>4} {k:>4} {h:>3} {w:>3} " + f"{y}x{x} {min_ms:>9.4f} {avg_ms:>9.4f} {tflops:>8.2f} {bw:>8.1f}" + ) + else: + all_ok = False + + # ========================================================================= + # Step 4: 3D Forward + # ========================================================================= + print("\n--- Step 4: Forward 3D ---") + for label, n, c, k, d, h, w, z, y, x in [ + ("3D-small", 1, 64, 64, 8, 16, 16, 3, 3, 3), + ("3D-medium", 1, 64, 128, 16, 32, 32, 3, 3, 3), + ]: + prob = GroupedConvProblem( + N=n, C=c, K=k, Di=d, Hi=h, Wi=w, Z=z, Y=y, X=x, direction="forward" + ) + inp = np.random.uniform(-0.3, 0.3, prob.input_shape()).astype(np_dtype) + wei = np.random.uniform(-0.3, 0.3, prob.weight_shape()).astype(np_dtype) + min_ms, avg_ms = bench_run(runner_by_key[("forward", 3)], inp, wei, prob) + if avg_ms > 0: + tflops = prob.flops / (avg_ms * 1e9) + print(f" {label:<14} {min_ms:.4f} / {avg_ms:.4f} ms {tflops:.2f} TFLOPS") + + # 
========================================================================= + # Step 5: Backward directions + # ========================================================================= + print("\n--- Step 5: Backward Directions ---") + for label, direction in [ + ("bwd_data ResNet-s3", "bwd_data"), + ("bwd_weight ResNet-s3", "bwd_weight"), + ]: + prob = GroupedConvProblem( + N=1, + C=128, + K=128, + Hi=28, + Wi=28, + Y=3, + X=3, + stride_h=1, + stride_w=1, + pad_h=1, + pad_w=1, + direction=direction, + ) + inp = np.random.uniform(-0.3, 0.3, prob.input_shape()).astype(np_dtype) + wei = np.random.uniform(-0.3, 0.3, prob.weight_shape()).astype(np_dtype) + min_ms, avg_ms = bench_run(runner_by_key[(direction, 2)], inp, wei, prob) + if avg_ms > 0: + tflops = prob.flops / (avg_ms * 1e9) + print( + f" {label:<14} {direction:>12} {min_ms:.4f} / {avg_ms:.4f} ms {tflops:.2f} TFLOPS" + ) + + for runner in runner_by_key.values(): + runner.cleanup() + + print("\n" + "=" * 70) + print(f" JIT build: {jit_s:.3f} s") + print(f" Warmup: {args.warmup}, Repeat: {args.repeat}") + print(f" Status: {'PASS' if all_ok else 'FAIL'}") + print("=" * 70) + return 0 if all_ok else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dispatcher/examples/grouped_conv/python/06_registry_json.py b/dispatcher/examples/grouped_conv/python/06_registry_json.py new file mode 100644 index 0000000000..1a3dc854e7 --- /dev/null +++ b/dispatcher/examples/grouped_conv/python/06_registry_json.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Example 06: Registry, Heuristic Selection & JSON Export + +Declares multiple kernel configurations with different tile sizes, +builds a registry, demonstrates heuristic runtime kernel selection, +JSON round-trip, and GPU execution. 
+ +Usage: + python3 06_registry_json.py + python3 06_registry_json.py --workers 4 +""" + +import sys +import time +import argparse +import numpy as np +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python")) + +from grouped_conv_utils import ( + GroupedConvKernelConfig, + GroupedConvProblem, + GroupedConvRegistry, + detect_gpu_arch, +) + + +def conv_heuristic(problem): + spatial = problem.Ho * problem.Wo + if spatial > 400: + return ["256", "128", "64"] + return ["64", "128", "256"] + + +def main(): + parser = argparse.ArgumentParser(description="Registry, Heuristic & JSON") + parser.add_argument("--arch", default=detect_gpu_arch()) + parser.add_argument("--dtype", default="fp16", choices=["fp16", "bf16"]) + parser.add_argument("--workers", type=int, default=0) + args = parser.parse_args() + + arch = args.arch + print("=" * 70) + print("Example 06: Registry, Heuristic Selection & JSON Export") + print("=" * 70) + print(f"\n Arch: {arch}, Dtype: {args.dtype}") + + # Step 1: Declare kernels with full explicit parameters + print("\n--- Step 1: Declare Kernels + Build Registry ---") + reg = GroupedConvRegistry("conv_tiles") + + reg.add( + GroupedConvKernelConfig( + variant="forward", + ndim_spatial=2, + arch=arch, + dtype=args.dtype, + tile_m=1, + tile_n=256, + tile_k=256, + wave_m=2, + wave_n=2, + wave_k=1, + warp_tile_m=32, + warp_tile_n=32, + warp_tile_k=16, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + num_wave_groups=1, + num_groups_to_merge=1, + ) + ) + reg.add( + GroupedConvKernelConfig( + variant="forward", + ndim_spatial=2, + arch=arch, + dtype=args.dtype, + tile_m=1, + tile_n=128, + tile_k=128, + wave_m=2, + wave_n=2, + wave_k=1, + warp_tile_m=32, + warp_tile_n=32, + warp_tile_k=16, + pipeline="compv4", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + num_wave_groups=1, + num_groups_to_merge=1, + ) + ) + reg.add( + GroupedConvKernelConfig( + variant="forward", + ndim_spatial=2, + arch=arch, + dtype=args.dtype, + tile_m=1, + tile_n=64, + tile_k=64, + wave_m=1, + wave_n=4, + wave_k=1, + warp_tile_m=16, + warp_tile_n=16, + warp_tile_k=32, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + block_per_cu=1, + num_wave_groups=1, + num_groups_to_merge=1, + ) + ) + reg.print_registry() + + # Step 2: Heuristic kernel selection + print("\n--- Step 2: Heuristic Kernel Selection ---") + problems = [ + ( + "small_7x7", + GroupedConvProblem( + N=1, + C=512, + K=512, + Hi=7, + Wi=7, + Y=3, + X=3, + pad_h=1, + pad_w=1, + direction="forward", + ), + ), + ( + "medium_14x14", + GroupedConvProblem( + N=1, + C=256, + K=256, + Hi=14, + Wi=14, + Y=3, + X=3, + pad_h=1, + pad_w=1, + direction="forward", + ), + ), + ( + "large_56x56", + GroupedConvProblem( + N=1, + C=64, + K=128, + Hi=56, + Wi=56, + Y=3, + X=3, + pad_h=1, + pad_w=1, + direction="forward", + ), + ), + ] + print(f" {'Problem':<16} {'Spatial':>8} {'Selected Kernel':<50}") + print(f" {'-' * 74}") + for label, prob in problems: + selected = reg.select(prob, heuristic=conv_heuristic) + spatial = prob.Ho * prob.Wo + sel_name = selected.name if selected else "none" + print(f" {label:<16} {spatial:>8} {sel_name:<50}") + + # Step 3: JSON round-trip + print("\n--- Step 3: JSON Round-Trip ---") + json_str = reg.to_json() + print(f" Exported: {len(json_str)} 
bytes, {len(reg)} kernels") + imported = GroupedConvRegistry.from_json(json_str) + print(f" Imported: {len(imported)} kernels") + orig = reg.kernels[0] + imp = imported.kernels[0] + rt_ok = ( + orig.vector_size_a == imp.vector_size_a + and orig.block_per_cu == imp.block_per_cu + and orig.tile_n == imp.tile_n + ) + print(f" Full fields round-trip: {'OK' if rt_ok else 'FAIL'}") + + # Step 4: JIT build + GPU execution + print("\n--- Step 4: JIT Build + GPU Execution ---") + workers = args.workers if args.workers > 0 else None + jit_reg = GroupedConvRegistry("jit_conv") + jit_reg.add( + GroupedConvKernelConfig( + variant="forward", + ndim_spatial=2, + arch=arch, + dtype=args.dtype, + tile_m=1, + tile_n=128, + tile_k=128, + wave_m=2, + wave_n=2, + wave_k=1, + warp_tile_m=32, + warp_tile_n=32, + warp_tile_k=16, + pipeline="compv4", + scheduler="intrawave", + epilogue="cshuffle", + vector_size_a=4, + vector_size_b=8, + vector_size_c=8, + ) + ) + t0 = time.perf_counter() + runners = jit_reg.build(verbose=False, max_workers=workers) + jit_s = time.perf_counter() - t0 + + if ("forward", 2) not in runners: + print(" JIT build failed") + return 1 + runner = runners[("forward", 2)] + print(f" JIT build: {jit_s:.3f} s") + print(f" Library: {runner.library_path}") + + prob = GroupedConvProblem( + N=1, C=128, K=128, Hi=16, Wi=16, Y=3, X=3, pad_h=1, pad_w=1, direction="forward" + ) + np_dtype = np.float16 if args.dtype in ["fp16", "bf16"] else np.float32 + inp = np.random.uniform(-0.3, 0.3, prob.input_shape()).astype(np_dtype) + wei = np.random.uniform(-0.3, 0.3, prob.weight_shape()).astype(np_dtype) + res = runner.run(inp, wei, prob) + runner.cleanup() + + if res.success: + print(f" Time: {res.time_ms:.4f} ms") + print(f" TFLOPS: {res.tflops:.2f}") + print(f" NonZero: {np.count_nonzero(res.output)}/{res.output.size}") + + gpu_ok = res.success + print("\n" + "=" * 70) + print(f" Registry: {len(reg)} kernels (3 tile configs)") + print(" Heuristic: spatial-based selection demonstrated") + print(f" JSON: round-trip {'OK' if rt_ok else 'FAIL'}") + print(f" GPU: {'OK' if gpu_ok else 'FAIL'}") + print(f" Status: {'PASS' if gpu_ok and rt_ok else 'FAIL'}") + print("=" * 70) + return 0 if gpu_ok and rt_ok else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dispatcher/heuristics/.gitignore b/dispatcher/heuristics/.gitignore new file mode 100644 index 0000000000..d9523255bf --- /dev/null +++ b/dispatcher/heuristics/.gitignore @@ -0,0 +1,60 @@ +# Python bytecode and caches +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python + +# Jupyter notebooks +*.ipynb +.ipynb_checkpoints/ + +# Virtual environments +.venv/ +venv/ +ENV/ + +# IDE and editor files +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Test output and logs +*.log +test_output.log +custom_shapes_gpu_test.log + +# Benchmark and analysis output files +*.csv +*.json +!models/*/feature_spec.json +!models/*/train_manifest.json + +# Data files (parquet, arrow) +*.parquet +*.arrow + +# Temporary and NFS files +.nfs* +*.tmp +*.bak + +# Decompressed model files (compressed .lgbm.gz versions are tracked) +models/**/*.lgbm + +# User-specific test and analysis scripts +test_*.py +!tests/test_*.py +find_*.py +oracle_*.json +validation_results_*.csv +custom_shapes_*.csv +fp16_bf16_*.csv + +# Ignore all markdown files except tracked documentation +*.md +!DATA_GENERATION.md +!LEARNINGS.md +!README.md diff --git a/dispatcher/heuristics/DATA_GENERATION.md b/dispatcher/heuristics/DATA_GENERATION.md new file mode 100644 index 0000000000..819e77fe48 --- /dev/null +++ 
b/dispatcher/heuristics/DATA_GENERATION.md @@ -0,0 +1,412 @@ +# Data Generation Guide + +This document explains how to build benchmark binaries from the CK Tile engine, +generate benchmark datasets, and manage them for the ML kernel performance +prediction system. + +## Overview + +The ML heuristic needs benchmark data: measured TFLOPS, latency, and bandwidth +for every (problem shape, kernel config) pair. The tile engine builds one +executable per kernel configuration. Each executable benchmarks a single kernel +on a given problem size and outputs JSON with performance metrics. + +``` +CK source --> CMake configure --> ninja build --> benchmark binaries + (4608 per op/dtype/layout) + +benchmark binaries --> run on GPU --> streaming log --> parquet dataset + (per shape) (JSON blocks) (canonical schema) +``` + +## Prerequisites + +- **ROCm**: HIP >= 6.0.3 (for gfx950: HIP >= 6.0.4) +- **Build tools**: CMake >= 3.21, Ninja, HIP-aware clang compiler +- **Python**: 3.10+ with `pandas`, `pyarrow` +- **GPU**: ROCm-capable AMD GPU (MI250X, MI300X, MI355X, etc.) + +--- + +## Part 1: Building Benchmark Binaries from the Tile Engine + +If you already have pre-built binaries (e.g., in `/workspace/ck_tile/bin/`), +skip to Part 2. This section explains how to build them from source. + +### Step 1: CMake Configure + +From the CK repository root: + +```bash +cmake -S /workspace/rocm-libraries/projects/composablekernel \ + -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DGPU_TARGETS="gfx950" \ + -DGEMM_UNIVERSAL_DATATYPE="fp8" \ + -DGEMM_UNIVERSAL_LAYOUT="rcr" \ + -G Ninja +``` + +**Key CMake variables:** + +| Variable | Default | Description | +|---|---|---| +| `GPU_TARGETS` | (required) | Target GPU architectures. Supported: `gfx90a`, `gfx942`, `gfx950`, `gfx1201`. Semicolon-separated for multiple. | +| `GEMM_UNIVERSAL_DATATYPE` | `"fp8;fp16"` | Data types to build. Options: `fp8`, `fp16`, `bf16`, `bf8`. Semicolon-separated. | +| `GEMM_UNIVERSAL_LAYOUT` | `"rcr;rrr;crr;ccr"` | Layouts to build. Semicolon-separated. | +| `GEMM_UNIVERSAL_CONFIG_FILE` | `"default_config.json"` | Kernel config file (in the `configs/` directory). Controls which tile sizes, warp configs, pipelines, etc. are enumerated. | +| `ENABLE_CCACHE_GEMM_UNIVERSAL` | `OFF` | Enable ccache for faster rebuilds. | + +**Example: build only fp8 RCR for gfx950 (fastest, ~4608 kernels):** +```bash +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release \ + -DGPU_TARGETS="gfx950" \ + -DGEMM_UNIVERSAL_DATATYPE="fp8" \ + -DGEMM_UNIVERSAL_LAYOUT="rcr" \ + -G Ninja +``` + +**Example: build all dtypes and layouts (slow, ~4608 * 4 * 4 = ~73K kernels):** +```bash +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release \ + -DGPU_TARGETS="gfx950" \ + -DGEMM_UNIVERSAL_DATATYPE="fp8;fp16;bf16;bf8" \ + -DGEMM_UNIVERSAL_LAYOUT="rcr;rrr;crr;ccr" \ + -G Ninja +``` + +### What happens during configure + +1. CMake calls `gemm_universal_instance_builder.py --list_kernels` to enumerate + all valid kernel configurations from the config JSON. +2. It writes `gemm_universal_kernel_list.txt` (one kernel per line) and + `gemm_universal_kernel_count.txt` to the build directory. +3. For each kernel, it creates a ninja build target. 
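
After configuring, you can sanity-check the enumeration before committing to a long build. A minimal sketch, assuming the two files named above sit in the build directory and that the count file holds a single integer:

```python
from pathlib import Path

build = Path("build")  # adjust to your build directory

# Files written during CMake configure (see the list above).
kernels = (build / "gemm_universal_kernel_list.txt").read_text().splitlines()
count = int((build / "gemm_universal_kernel_count.txt").read_text().strip())

print(f"Enumerated {len(kernels)} kernels (count file reports {count})")
print("Example kernel target:", kernels[0])
```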
+ +### Step 2: Build + +```bash +# Build all benchmarks for the configured dtypes/layouts +ninja -C build benchmark_gemm_universal_all + +# Or build a specific dtype/layout combo +ninja -C build benchmark_gemm_universal_fp8_rcr + +# Or build by pipeline type +ninja -C build benchmark_gemm_universal_compv4_pipeline +ninja -C build benchmark_gemm_universal_mem_pipeline + +# Or build a single specific kernel +ninja -C build benchmark_gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128 +``` + +**Build time estimates:** +- ~4608 kernels (one dtype, one layout): 1-4 hours depending on CPU cores +- Use `-j ` to control parallelism: `ninja -C build -j 32 benchmark_gemm_universal_fp8_rcr` + +### Step 3: Verify binaries + +Binaries are placed in `build/bin/`: + +```bash +ls build/bin/benchmark_gemm_universal_fp8_rcr_* | wc -l +# Expected: 4608 (for default config) + +# Test one binary +./build/bin/benchmark_gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128 \ + -m=1024 -n=1024 -k=1024 -warmup=3 -repeat=10 -verify=0 +``` + +### Kernel config files + +The config files live in: +``` +tile_engine/ops/gemm/gemm_universal/configs/ + default_config.json # Default: full enumeration + default_ci_config.json # CI: reduced set for fast testing + user_provided_config.json # Custom: your own subset +``` + +To use a custom config: +```bash +cmake ... -DGEMM_UNIVERSAL_CONFIG_FILE="user_provided_config.json" +``` + +The config controls which tile sizes (e.g., 128x128x64, 256x256x32), warp +configurations (e.g., 2x2x1, 1x4x1), pipelines (compv3, compv4, mem), +schedulers, and other parameters are included in the kernel enumeration. + +### Building StreamK / other ops + +The same pattern applies to other tile engine ops: + +```bash +# StreamK +ninja -C build benchmark_gemm_streamk_fp8_rcr + +# Grouped convolution +ninja -C build benchmark_grouped_conv_fwd_fp16_nhwgc +``` + +Each op has its own instance builder and config directory. + +--- + +## Part 2: Running Benchmarks and Generating Data + +## Quick Start + +### 1. Run benchmarks for a set of shapes + +Each binary accepts `-m=`, `-n=`, `-k=`, `-warmup=`, `-repeat=`, `-verify=` flags +and outputs JSON to stdout: + +```bash +/workspace/ck_tile/bin/benchmark_gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128 \ + -m=1024 -n=1024 -k=1024 -warmup=3 -repeat=10 -verify=0 +``` + +Output: +```json +{ + "name": "gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_...", + "problem": { + "split_k": 1, "m": 1024, "n": 1024, "k": 1024, + "dtype_a": "fp8", "dtype_b": "fp8", ... + }, + "perf_result": { + "latency(ms)": 0.04, + "tflops(TFlops)": 204.60, + "bandwidth(GB/s)": 624.39 + } +} +``` + +### 2. Batch generation using provided scripts + +**Wide coverage (diverse shapes across all regimes):** +```bash +python3 generate_wide_coverage.py \ + --bin_dir /workspace/ck_tile/bin \ + --out_dir data/wide_coverage \ + --batch_size 25 \ + --warmup 3 --repeat 10 +``` + +**Edge-case dimensions (N=1, K=1, small N/K):** +```bash +python3 generate_edge_dims.py +``` + +Both scripts write streaming log files that `data_pipeline.py` can parse. + +### 3. Parse logs into parquet + +```bash +python3 data_pipeline.py \ + -o data/my_dataset.parquet \ + --arch gfx950 \ + --capture_hw +``` + +The `--capture_hw` flag runs `rocminfo` once and injects the GPU hardware +profile (CU count, clock speed, cache sizes, etc.) into every row. 
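
Each JSON block like the one above becomes a single row in the canonical schema described in the next section. A minimal sketch of that mapping (illustrative only, not the actual `data_pipeline.py` implementation; the real pipeline also fills the kernel-config columns parsed from the kernel name, the hardware profile, and `run_id`):

```python
import json

def block_to_row(block: dict, arch: str = "gfx950") -> dict:
    """Flatten one benchmark JSON block into a schema-shaped row (sketch)."""
    prob, perf = block["problem"], block["perf_result"]
    tflops = float(perf.get("tflops(TFlops)", 0.0))
    latency = float(perf.get("latency(ms)", 0.0))
    return {
        "op_type": "gemm_universal",
        "dtype": prob["dtype_a"],
        "arch": arch,
        "kernel_name": block["name"],
        "m": prob["m"], "n": prob["n"], "k": prob["k"],
        "split_k": prob.get("split_k", 1),
        "measured_tflops": tflops,
        "latency_ms": latency,
        "bandwidth_gb_s": float(perf.get("bandwidth(GB/s)", 0.0)),
        "is_valid": tflops > 0 and latency > 0,
        # layout, tile_*/warp_*/pipeline/... come from the kernel name and
        # run metadata (omitted in this sketch)
    }

row = block_to_row(json.loads(one_json_block))  # one_json_block: a single JSON string
```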
+ +## Canonical Data Schema + +Every parquet file follows this schema: + +| Column | Type | Description | +|---|---|---| +| `op_type` | str | `gemm_universal`, `gemm_streamk`, etc. | +| `dtype` | str | `fp8`, `fp16`, `bf16`, `bf8` | +| `layout` | str | `rcr`, `rrr`, `crr`, `ccr` | +| `arch` | str | `gfx942`, `gfx950`, etc. | +| `kernel_name` | str | Full kernel identifier | +| `m`, `n`, `k` | int | Problem dimensions | +| `split_k` | int | Split-K factor (1 = standard) | +| `measured_tflops` | float | Ground-truth TFLOPS | +| `latency_ms` | float | Measured latency | +| `bandwidth_gb_s` | float | Measured bandwidth | +| `is_valid` | bool | True if tflops > 0 and latency > 0 | +| `tile_m`, `tile_n`, `tile_k` | int | Tile dimensions | +| `warp_m`, `warp_n`, `warp_k` | int | Warp config | +| `warp_tile_m/n/k` | int | Warp tile dimensions | +| `pipeline` | str | `compv3`, `compv4`, `mem`, etc. | +| `scheduler` | str | `intrawave`, `interwave` | +| `epilogue` | str | `cshuffle`, `default` | +| `pad_m`, `pad_n`, `pad_k` | bool | Padding flags | +| `persistent` | bool | Persistent kernel flag | +| `run_id` | str | Unique collection run identifier | + +## Shape Selection Guidelines + +Good training data requires diverse shapes. Cover all of these regimes: + +### By M dimension (batch size / output rows) +- **M=1**: single-token inference (hardest case for tiling) +- **Tiny M (2-16)**: small batch inference +- **Small M (32-128)**: medium batch +- **Medium M (256-2048)**: large batch / training +- **Large M (4096-20480)**: very large batch + +### By N and K dimension +- **N=1**: vector-matrix multiply (degenerate) +- **K=1**: rank-1 update / outer product (degenerate) +- **Small N or K (2-16)**: stress tile efficiency +- **Deep K (K > 4096)**: compute-bound regime +- **Shallow K (K < 256)**: memory-bound regime + +### By shape family +- **Square**: M ~ N ~ K (powers of 2) +- **Tall**: M >> N (tall output matrix) +- **Wide**: N >> M (wide output matrix) +- **Deep-K**: K >> M and K >> N + +### Special cases +- **Prime dimensions**: 17, 31, 127, 251, 509, 1021, 2039, 4093 + (worst-case for tile alignment, tests padding logic) +- **Non-power-of-2**: 48, 96, 192, 384, 576, 768, 1536, 3072, 4608 + (common in LLM architectures) +- **LLM inference shapes**: DeepSeek, LLaMA-7B, LLaMA-70B MLP/attention dims + +### Minimum recommended coverage + +For a production-quality model, aim for: +- At least 200 unique (M, N, K) shapes +- At least 10 shapes per shape family +- All kernel configs (4608 for fp8 RCR) run against every shape +- Multiple layouts if training a cross-layout model + +## Benchmark Quality Guidelines + +### Warmup and repeat +- Minimum `warmup=3`, `repeat=10` for fast iteration +- Production quality: `warmup=5`, `repeat=20` for stable measurements +- The `perf_result` values are averaged over `repeat` iterations + +### Noise handling +- Use **median** latency when aggregating multiple runs of the same benchmark +- Flag measurements where coefficient of variation exceeds 10% +- Avoid benchmarking under thermal throttling (check GPU temperature) +- Lock GPU clocks if possible for reproducibility + +### Environment metadata +Store with every dataset: +- GPU model and architecture (from `rocminfo`) +- ROCm driver version +- Clock mode (default / locked) +- Git hash of the CK tile engine build (if available) +- Timestamp + +## Adding Data for a New Op + +To generate benchmark data for a new operation (e.g., `gemm_streamk`): + +1. 
**Build the binaries** using the tile engine: + ```bash + ninja -C build benchmark_gemm_streamk_fp8_rcr + ``` + +2. **Write a generation script** (or modify `generate_wide_coverage.py`): + - Change the executable glob pattern to match the new op + - Add any op-specific CLI flags the binaries need + +3. **Run and parse**: + ```bash + python3 data_pipeline.py my_streamk_run.log \ + -o data/gemm_streamk_fp8_gfx950.parquet --arch gfx950 + ``` + +4. **Train**: + ```bash + python3 train.py --op gemm_streamk --dtype fp8 --arch gfx950 \ + --data_dir data/ --out_dir models/gemm_streamk_fp8_gfx950 + ``` + +## Adding Data for a New Layout + +Same binaries, same shapes -- just change the layout filter: + +```bash +# Build rrr binaries +ninja -C build benchmark_gemm_universal_fp8_rrr + +# Generate and parse +# ... (same flow, different bin_dir or executable glob) + +# Train a cross-layout model by putting all layouts in the same data_dir +python3 train.py --data_dir data/ --out_dir models/gemm_universal_fp8_gfx950_all_layouts +``` + +The feature engine includes `layout` as a categorical feature, so one model +can handle all layouts. + +## Incremental Data Collection + +When you have a trained model and want to add more data: + +1. Generate new data (new shapes, new layouts, etc.) +2. Parse into parquet alongside existing data +3. Warm-start from the previous model: + ```bash + python3 train.py --data_dir data/ --out_dir models/v2 \ + --warm_start models/v1 \ + --warm_start_n_estimators 200 + ``` + +This adds 200 new trees on top of the existing model. The feature schema +must match exactly (enforced automatically). + +## File Organization + +Recommended directory structure: + +``` +heuristics/ + data/ + gemm_universal_fp8_rcr_gfx950.parquet # original 108 shapes + wide_coverage/ # batch log files + wide_coverage_batch_001.log + wide_coverage_batch_002.log + ... + edge_dims/ # N=1, K=1 edge cases + edge_dims_batch_001.log + ... + models/ + gemm_universal_fp8_gfx950/ # trained model artifacts + model_tflops.lgbm + model_latency.lgbm + model_bandwidth.lgbm + feature_spec.json + train_manifest.json + cv_metrics_tflops.json + eval_report.json + ... +``` + +## Troubleshooting + +### Benchmark binary exits with non-zero code +Some kernel configs are invalid for certain problem sizes (e.g., tile_m=256 +with M=16). The data pipeline marks these as `is_valid=False` and they are +filtered out during training. This is expected. + +### Edge dims produce very few results +N=1 and K=1 shapes are degenerate -- most kernel configurations have minimum +dimension requirements and will fail or produce zero TFLOPS. The small number +of valid results is still useful (it tells the model which configs work for +these shapes). + +### Benchmarks are slow +Each shape requires running all 4608 kernel executables sequentially. At +~0.01s per kernel, that is ~46 seconds per shape. For 700 shapes, expect +~9 hours. Tips: +- Run on a dedicated GPU (no other workloads) +- Use `--batch_size 25` to get incremental output +- Parse and train on partial data while generation continues + +### Data from different GPUs / driver versions +Store `run_id` and hardware metadata with each dataset. Training on mixed +data is allowed but not recommended for production models. Filter to a +single `run_id` or `arch` for clean experiments. 
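
For example, a quick way to slice a mixed dataset down to a single clean experiment before training (column names follow the canonical schema; file paths are illustrative):

```python
import pandas as pd

# Combine any number of parquet files that follow the canonical schema.
frames = [
    pd.read_parquet("data/gemm_universal_fp8_rcr_gfx950.parquet"),
    pd.read_parquet("data/wide_coverage.parquet"),
]
df = pd.concat(frames, ignore_index=True)

# Keep only valid measurements from one arch (or filter on run_id instead).
clean = df[df["is_valid"] & (df["arch"] == "gfx950")]
print(len(clean), "rows,", clean[["m", "n", "k"]].drop_duplicates().shape[0], "unique shapes")
clean.to_parquet("data/train_gfx950_clean.parquet")
```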
diff --git a/dispatcher/heuristics/LEARNINGS.md b/dispatcher/heuristics/LEARNINGS.md new file mode 100644 index 0000000000..dba3514601 --- /dev/null +++ b/dispatcher/heuristics/LEARNINGS.md @@ -0,0 +1,151 @@ +# Learnings and Design Decisions + +Empirical findings from building the CK Tile kernel performance prediction system. +These inform the current defaults and explain why certain approaches were chosen. + +## 1. Log-Transform is Essential for Cross-Scale Accuracy + +**Problem**: GEMM TFLOPS spans 5 orders of magnitude across different problem +sizes. When training on raw TFLOPS, the regression loss (RMSE) is dominated by +large shapes where absolute errors are biggest. The model learns to predict +large shapes accurately but ignores tiny shapes where the TFLOPS values are +much lower. + +**Evidence** (168 shapes, 626K rows, 5-fold GroupKFold CV): + + +| Model | Mean Eff | P10 Eff | tiny_m Eff | Min Eff | +| ----------------------------- | ---------- | ---------- | ---------- | ---------- | +| Raw TFLOPS (500 trees) | 92.73% | 80.24% | 84.55% | 4.26% | +| **log1p(TFLOPS)** (500 trees) | **96.92%** | **94.34%** | **94.89%** | **60.27%** | +| log1p(TFLOPS) (2000 trees) | 97.51% | 93.89% | 96.04% | 63.56% | + + +**Solution**: Train on `log1p(measured_tflops)` and apply `expm1()` to +predictions. This is now the default in `train.py`. Pass `--no_log_transform` +to revert to raw regression (not recommended). + +**Why log1p, not log**: `log1p(x) = log(1 + x)` handles zero and near-zero +TFLOPS gracefully, whereas `log(x)` produces -inf for x=0. + +## 2. Tiny-M Shapes are the Hardest Case + +M=1 (single-token inference) shapes are fundamentally different from batch shapes: + +- Most kernel configurations produce very low TFLOPS +- The "best" kernel is often only marginally better than the rest +- The oracle performance itself is very low, so any prediction error tanks efficiency +- Many kernels fail outright (tile_m=128 with M=1 wastes 127/128 of the tile) + +The bottom shapes in our evaluation are all M=1, with efficiencies in the +63-70% range. These shapes have such low absolute performance that the model's +noise floor exceeds the performance difference between kernels. + +**Mitigation**: Log-transform helps significantly (tiny_m improved from 84% to +96%). For production use with M=1, consider a dedicated fallback (e.g., +hardcoded kernel selection for M < 4 based on known-good configs). + +## 3. IHEM (Hard Example Mining) Hurts When Scale is the Issue + +We tried Iterative Hard Example Mining with sample reweighting (2x-5x weight +on hard shapes). Result: it made things **worse**, degrading mean efficiency +from 94.31% to 92.90% over 3 iterations. + +**Why**: The hard shapes are hard because of scale mismatch, not because the +model lacks capacity. Reweighting amplifies the small-TFLOPS rows, which +distorts the learned relationship between features and performance for the +majority of shapes. The log-transform was the correct fix -- it addresses the +root cause (scale) rather than the symptom (bad predictions on tiny shapes). + +**Lesson**: IHEM is useful when the model has capacity gaps (e.g., certain +pipeline types are underrepresented). It is counterproductive when the issue +is target-variable scale. Always try target transforms before reweighting. + +## 4. GroupKFold Key = (M, N, K) Forces Generalization + +The validation uses `GroupKFold` where the group key is `(M, N, K)` -- all +kernels for the same shape go to the same fold. 
This means: + +- The model is always evaluated on shapes it has **never seen** during training +- Layout is excluded from the key, forcing the model to generalize across layouts +- Since models are per-arch, `arch` is implicit (constant within one training run) + +This is much stricter than random row splitting, where the model would see some +kernels for each shape during training. Our efficiency numbers are conservative +estimates of real-world performance on unseen shapes. + +## 5. Model Size vs Accuracy Tradeoff + + +| Config | Trees | Leaves | LR | Mean Eff | P10 Eff | Train Time | +| ------------------ | -------- | ------- | -------- | ---------- | ---------- | ------------- | +| Small (default v1) | 500 | 127 | 0.05 | 96.92% | 94.34% | ~20s | +| **Big (current)** | **2000** | **255** | **0.02** | **97.51%** | **93.89%** | **~25s/fold** | + + +The bigger model improved mean efficiency by 0.6% but P10 didn't improve +(actually slightly worse). The extra capacity helps on medium shapes but +doesn't crack the tiny-M floor. This suggests the feature set, not model +capacity, is the limiting factor for the hardest shapes. + +For C++ deployment, the bigger model (2000 trees, 255 leaves) is still fast +enough -- LightGBM inference is O(trees * log(leaves)) per sample, which is +~microseconds even at 2000 trees. + +## 6. N=1 and K=1 Shapes are Degenerate + +We generated benchmark data for 546 edge-case shapes (N=1, K=1, small N/K). +Result: **zero valid kernel results** across 94 shapes. All 4608 kernels either +fail or produce 0 TFLOPS for these degenerate dimensions. + +This means: + +- The tile engine kernels have hard minimum dimension requirements +- N=1 / K=1 shapes cannot be handled by the current kernel set +- These shapes need dedicated kernels (e.g., BLAS-1/BLAS-2 fallbacks) +- The ML model should not be expected to handle them -- they should be filtered +out before reaching the heuristic + +## 7. Feature Engineering Insights + +From LightGBM feature importances on the log-target model: + +**Top features** (by split count): + +- `M, N, K` -- raw dimensions are always the most important +- `tile_m, tile_n, tile_k` -- the tile shape is the primary kernel differentiator +- `overall_tile_efficiency` -- how well the shape fits the tile (the interaction) +- `num_tiles_m, total_output_tiles` -- work decomposition +- `arithmetic_intensity` -- compute vs memory bound regime +- `pipeline` -- pipeline type (compv3 vs compv4 vs mem) significantly affects perf + +**Low-importance features**: + +- Hardware constants (CUs, clock, caches) -- they're constant within one arch +model, so they provide no discriminative signal. They'll become important when +training cross-arch models. +- `split_k` -- always 1 in current data +- `persistent` -- rarely True in current kernel set + +## 8. Warm-Start Works for Incremental Updates + +LightGBM's `init_model` parameter successfully continues training from an +existing model. New trees are added on top of existing ones. Key considerations: + +- Feature schema must match exactly (enforced by `check_feature_compatibility`) +- Use fewer new trees (200-500) since we're refining, not starting fresh +- The `train_manifest.json` tracks the full lineage (total trees, data sizes) +- Quality should be at least as good as the base model (tested) + +## 9. 
Data Volume Matters More Than Model Complexity

| Dataset                  | Shapes | Rows | Mean Eff (log, 500 trees)     |
| ------------------------ | ------ | ---- | ----------------------------- |
| Original (DeepSeek only) | 108    | 418K | 98.28% (on seen distribution) |
| + Wide coverage          | 168    | 626K | 96.92% (harder distribution)  |

The original 108-shape model looked great (98.28%) but was overfitting to the
DeepSeek LLM inference M=1 distribution. Adding 60 diverse shapes (many M=1)
exposed the model's weakness on tiny shapes. More diverse training data is
always better than a bigger model on narrow data.

## Summary of Defaults

Based on these findings, the current defaults in `train.py` are:

- **Target transform**: `log1p` for TFLOPS and bandwidth (scale normalization)
- **Model**: 2000 trees, 255 leaves, max depth 15, LR 0.02
- **Validation**: 5-fold GroupKFold, key = (M, N, K)
- **Early stopping**: patience 100 (let trees fully converge)
- **Warm start**: 500 new trees (was 200, increased for bigger base model)

diff --git a/dispatcher/heuristics/README.md b/dispatcher/heuristics/README.md
new file mode 100644
index 0000000000..91b07466b6
--- /dev/null
+++ b/dispatcher/heuristics/README.md
@@ -0,0 +1,271 @@
# CK Tile Heuristics: ML-Based Kernel Selection

Fast, accurate kernel selection for CK Tile operations using LightGBM regression
with Origami-augmented feature engineering.

## What This Does

Instead of running all 4608+ kernel configurations on the GPU to find the best
one (exhaustive search taking ~46 seconds per shape), this system trains an ML
model that predicts TFLOPS for any (problem, kernel) pair in microseconds. It
scores all candidates instantly and picks the best kernel -- achieving 98.28%
of oracle-best TFLOPS efficiency across 108 tested shapes.

## Quick Start

### 1. Generate and convert benchmark data

**Step 1: Generate benchmark data**

```bash
python3 generate_benchmark_data.py \
    --build_dir /path/to/build \
    --output_dir data/fp16_original \
    --dtype fp16 \
    --layout rcr \
    --num_build_jobs 4 \
    --warmup 10 \
    --repeat 50
```

This outputs JSON with all benchmark results.

**Step 2: Convert JSON to parquet training format**

```bash
python3 convert_json_to_parquet.py \
    --input data/fp16_original/benchmark_results_fp16_rcr.json \
    --output data/fp16_original/fp16_training_data.parquet \
    --arch gfx950
```

The converter automatically fixes pad flags for `_mem` kernels and validates data.

**Alternative: Parse existing logs**

If you have raw benchmark logs from CK Tile:

```bash
python3 data_pipeline.py ck_tile_testrun_2.log \
    -o data/gemm_universal_fp8_rcr_gfx950.parquet \
    --arch gfx950 --capture_hw
```

### 2. Train a model

```bash
python3 train.py \
    --data_dir data/ \
    --out_dir models/gemm_universal_fp8_gfx950 \
    --op gemm_universal --dtype fp8 --arch gfx950
```

**Note**: Trained models are automatically compressed to `.lgbm.gz` format to save space (~67% reduction). The Python tools automatically decompress them on first use and cache the decompressed version. For warm-start training, decompression happens automatically.

### 3. Evaluate

```bash
python3 evaluate.py \
    --model_dir models/gemm_universal_fp8_gfx950 \
    --data_dir data/ --op gemm_universal --dtype fp8
```

### 4. Predict the best kernel for a problem

```bash
python3 predict.py \
    --model_dir models/gemm_universal_fp8_gfx950 \
    --m 128 --n 1536 --k 7168 --layout rcr
```

### 5. Search for optimal configs (optional)

```bash
python3 search.py \
    --model_dir models/gemm_universal_fp8_gfx950 \
    --m 128 --n 1536 --k 7168 \
    --strategy random --budget 500 --top_k 10
```

### 6. Using models in C++ (requires decompression)

C++ code uses the LightGBM C API which requires uncompressed `.lgbm` files. If you have compressed models (`.lgbm.gz`), decompress them first:

```bash
cd models/gemm_universal_fp16_gfx950
gunzip model_tflops.lgbm.gz
```

Then use in C++ examples:

```bash
cd dispatcher/build
./gemm_09_ml_heuristic --model ../heuristics/models/gemm_universal_fp16_gfx950/model_tflops.lgbm
```

**Note**: Python tools automatically decompress `.lgbm.gz` files on first use, so you can run Python scripts first to trigger decompression, then use the same models in C++.

## Architecture

```
Problem (M, N, K, dtype, layout)
        |
        v
FeatureEngine.extract_batch()   <-- 55 features: problem, kernel, interaction, hardware
        |
        v
LGBMRegressor.predict()         <-- predicts TFLOPS for each candidate kernel
        |
        v
Sort by predicted TFLOPS        <-- rank all candidates
        |
        v
Select Top-1 kernel             <-- 98.28% mean efficiency, <1ms inference
```

Three models are trained per (op, dtype, arch):
- **TFLOPS model** (primary): used for kernel ranking
- **Latency model** (auxiliary): for latency-sensitive workloads
- **Bandwidth model** (auxiliary): for memory-bound analysis

## File Inventory

| File | Purpose |
|---|---|
| `generate_benchmark_data.py` | Build and run benchmarks across ~25 diverse problem sizes, output JSON |
| `convert_json_to_parquet.py` | Convert benchmark JSON to parquet training format, fix `_mem` pad flags |
| `data_pipeline.py` | Parse raw benchmark logs into canonical parquet datasets |
| `feature_engine.py` | 55-feature extraction: problem, kernel, interaction, hardware profile |
| `train.py` | Multi-target LGBMRegressor training with GroupKFold CV, IHEM, warm-start |
| `predict.py` | Predictor class: predict TFLOPS/latency/bandwidth, rank kernels |
| `evaluate.py` | Full evaluation: global metrics, per-shape/layout/pipeline slices |
| `search.py` | Surrogate search: discrete DE, random top-K |
| `generate_wide_coverage.py` | Generate benchmark data across 706 diverse shapes |
| `generate_edge_dims.py` | Generate N=1, K=1, and other edge-case shapes |
| `DATA_GENERATION.md` | Detailed guide for building binaries and generating data |
| `plan.md` | Full design plan with architecture, milestones, and rationale |

## Features Used (55 total)

### Problem features (13)
`M, N, K, split_k, log2(M), log2(N), log2(K), log2(MNK),
arithmetic_intensity, aspect_ratio_mn, aspect_ratio_mk, aspect_ratio_nk, layout`

### Kernel features (21)
`tile_m, tile_n,
tile_k, warp_m, warp_n, warp_k, warp_tile_m, warp_tile_n, +warp_tile_k, pipeline, scheduler, epilogue, pad_m, pad_n, pad_k, persistent, +num_warps, tile_volume, tile_mn, lds_usage_estimate, lds_usage_ratio` + +### Interaction features (9) +`num_tiles_m, num_tiles_n, num_tiles_k, total_output_tiles, +tile_eff_m, tile_eff_n, tile_eff_k, overall_tile_efficiency, cu_utilization` + +### Hardware profile features (12) +`hw_num_cus, hw_simds_per_cu, hw_total_simds, hw_shader_engines, +hw_max_clock_mhz, hw_max_waves_per_cu, hw_wavefront_size, hw_lds_capacity, +hw_l1_cache_kb, hw_l2_cache_kb, hw_l3_cache_kb, hw_num_xcd` + +## Model Performance + +### fp8 RCR, gfx950 + +| Metric | 108 shapes (original) | 168 shapes (wide coverage) | +|---|---|---| +| Mean TFLOPS Efficiency | 98.28% | 97.51% | +| P10 TFLOPS Efficiency | 94.64% | 93.89% | +| tiny_m (M=1) Efficiency | 95.57% | 96.04% | +| R2 (TFLOPS) | 0.997 | 0.993 | + +### fp16 RCR, gfx950 + +Trained on 25 shapes, 1,024 kernels, 21,920 valid benchmarks. + +| Metric | Value | +|---|---| +| Mean TFLOPS Efficiency | 99.36% | +| P10 TFLOPS Efficiency | 98.05% | +| P50 TFLOPS Efficiency | 100.00% | +| Min Efficiency | 95.45% | +| NDCG@1 | 64.00% | +| Top-5 Hit Rate | 88.00% | + +**Shape Family Breakdown:** + +| Shape Family | Mean Eff | P10 Eff | Shapes | +|---|---|---|---| +| Large M (M≥1024) | 99.54% | 99.07% | 4 | +| Medium M (128≤M<1024) | 99.62% | 98.74% | 7 | +| Small M (8≤M<128) | 98.82% | 96.22% | 8 | +| Tiny M (M<8) | 99.65% | 98.96% | 6 | + +**Pipeline Breakdown:** + +| Pipeline | Mean Eff | P10 Eff | +|---|---|---| +| compv3 | 99.75% | 99.09% | +| compv4 | 99.40% | 98.54% | +| mem | 99.08% | 96.59% | + +Training uses `log1p(TFLOPS)` as the target by default, which normalizes the +scale across shapes spanning 0.02 to 2230 TFLOPS. This was the key finding +that improved tiny-M shapes from 84% to 96% efficiency. See +[LEARNINGS.md](LEARNINGS.md) for details. + +## Validation + +Training uses `GroupKFold(n_splits=5)` with group key `(M, N, K)` to ensure +the model is evaluated on shapes it has never seen during training. Layout is +excluded from the group key to force cross-layout generalization. + +## Incremental Training (Warm Start) + +When new benchmark data arrives, update the model without retraining from scratch: + +```bash +python3 train.py \ + --data_dir data/ \ + --out_dir models/v2 \ + --warm_start models/gemm_universal_fp8_gfx950 \ + --warm_start_n_estimators 200 +``` + +This adds 200 new trees on top of the existing model. Feature schemas must +match exactly (automatically enforced). + +## Extending to New Ops + +Adding support for a new operation (e.g., `gemm_streamk`, `grouped_conv`): + +1. **Build binaries**: `ninja -C build benchmark_gemm_streamk_fp8_rcr` +2. **Subclass `FeatureEngine`**: add op-specific features (e.g., StreamK split factor) +3. **Generate data**: run benchmarks across diverse shapes +4. **Train**: `python3 train.py --op gemm_streamk --dtype fp8 --data_dir data/ --out_dir models/` + +The training, evaluation, prediction, and search infrastructure is fully +op-agnostic -- only the feature engine needs a new subclass. 
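+
+As a rough illustration of step 2, the sketch below subclasses
+`GemmUniversalFeatureEngine` from `feature_engine.py` and appends one extra
+feature. The class name `GemmStreamKFeatureEngine` and the
+`streamk_split_factor` field are hypothetical examples, not part of the
+current code:
+
+```python
+# Hypothetical sketch -- gemm_streamk support does not exist in this tree.
+import numpy as np
+import pandas as pd
+
+from feature_engine import FeatureEngine, GemmUniversalFeatureEngine
+
+
+class GemmStreamKFeatureEngine(GemmUniversalFeatureEngine):
+    """Adds a StreamK split-factor feature on top of the gemm_universal set."""
+
+    def get_feature_names(self) -> list[str]:
+        return super().get_feature_names() + ["streamk_split_factor"]
+
+    def extract(self, problem: dict, kernel: dict) -> np.ndarray:
+        base = super().extract(problem, kernel)
+        split = float(kernel.get("streamk_split_factor", 1))
+        return np.append(base, split)
+
+    def extract_batch(self, df: pd.DataFrame) -> np.ndarray:
+        # Fall back to the generic per-row loop from the base ABC so the
+        # extra column is included (the vectorized parent path only knows
+        # the original gemm_universal feature layout).
+        return FeatureEngine.extract_batch(self, df)
+```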
+ +## Tests + +102 tests covering all modules: + +```bash +python3 -m pytest tests/ -v +``` + +Test coverage includes: +- Log parsing with malformed JSON, empty logs, single-kernel shapes +- Feature formula correctness (tile efficiency, LDS usage, arithmetic intensity) +- Corner-case shapes: M=1, N=1, K=1, prime dimensions, 20480x7168x256 +- Batch vs single extraction parity +- Parameter space validation and projection +- Predictor: single/batch prediction, ranking, missing models, empty inputs +- Training: group keys, efficiency computation, warm-start, feature compatibility +- Search: random, DE, config validity, determinism + +## Documentation + +- **[README.md](README.md)**: This file -- quick start, architecture, performance +- **[DATA_GENERATION.md](DATA_GENERATION.md)**: Complete guide for building tile engine + binaries, running benchmarks, managing datasets, and troubleshooting +- **[LEARNINGS.md](LEARNINGS.md)**: Empirical findings and design decisions (log-transform, + IHEM results, tiny-M analysis, feature importance, N=1/K=1 edge cases) diff --git a/dispatcher/heuristics/__init__.py b/dispatcher/heuristics/__init__.py new file mode 100644 index 0000000000..e208c91163 --- /dev/null +++ b/dispatcher/heuristics/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +# CK Tile Heuristics: ML-based kernel selection diff --git a/dispatcher/heuristics/collect_additional.sh b/dispatcher/heuristics/collect_additional.sh new file mode 100755 index 0000000000..d963b1483a --- /dev/null +++ b/dispatcher/heuristics/collect_additional.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +# Generate additional benchmark data for shapes NOT in the original log. +# Runs in background; outputs streaming JSON that can be parsed by data_pipeline.py. 
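+#
+# Example follow-up once the run completes (a sketch; the output parquet name
+# below is an arbitrary choice, not a path used elsewhere in this tree):
+#   nohup ./collect_additional.sh &
+#   python3 data_pipeline.py data/additional_shapes.log \
+#       -o data/additional_shapes.parquet --arch gfx950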
+ +BIN_DIR="/workspace/ck_tile/bin" +OUT_LOG="data/additional_shapes.log" +WARMUP=3 +REPEAT=10 + +mkdir -p data + +# Additional shapes: square powers-of-2 and common ML sizes not in original DeepSeek set +SHAPES=( + "64,64,64" + "128,128,128" + "256,256,256" + "512,512,512" + "1024,1024,1024" + "2048,2048,2048" + "4096,4096,4096" + "1,4096,4096" + "8,4096,4096" + "32,4096,4096" + "128,4096,4096" + "1,4096,11008" + "32,4096,11008" + "1,8192,8192" + "32,8192,8192" + "1,8192,28672" + "32,8192,28672" + "256,256,8192" + "8192,8192,256" + "1024,4096,1024" + "4096,1024,4096" + "2048,8192,2048" +) + +echo "CK Tile Additional Shapes Benchmark" > "$OUT_LOG" +echo "GPU ID: 0" >> "$OUT_LOG" +echo "Implementation: gemm_universal" >> "$OUT_LOG" +echo "" >> "$OUT_LOG" + +SHAPE_IDX=0 +for SHAPE in "${SHAPES[@]}"; do + IFS=',' read -r M N K <<< "$SHAPE" + SHAPE_IDX=$((SHAPE_IDX + 1)) + + echo "========================================" >> "$OUT_LOG" + echo "Shape $SHAPE_IDX: M=$M N=$N K=$K dtype=fp8 layout=rcr" >> "$OUT_LOG" + echo "========================================" >> "$OUT_LOG" + + KERNEL_COUNT=0 + for EXE in "$BIN_DIR"/benchmark_gemm_universal_fp8_rcr_*; do + KERNEL_COUNT=$((KERNEL_COUNT + 1)) + OUTPUT=$("$EXE" -m="$M" -n="$N" -k="$K" -warmup=$WARMUP -repeat=$REPEAT -verify=0 2>/dev/null) + # Extract just the JSON block + echo "$OUTPUT" | sed -n '/{/,/^}/p' >> "$OUT_LOG" + done + + echo "Found $KERNEL_COUNT kernels" >> "$OUT_LOG" + echo "Completed shape $SHAPE_IDX: M=$M N=$N K=$K ($KERNEL_COUNT kernels)" >&2 +done + +echo "Done generating additional data" >&2 diff --git a/dispatcher/heuristics/convert_json_to_parquet.py b/dispatcher/heuristics/convert_json_to_parquet.py new file mode 100644 index 0000000000..4cfd667c76 --- /dev/null +++ b/dispatcher/heuristics/convert_json_to_parquet.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Convert benchmark JSON results to parquet format for training. 
+ +Usage: + python convert_json_to_parquet.py \ + --input benchmark_results_fp16_rcr.json \ + --output fp16_training_data.parquet + +Features: + - Converts JSON benchmark results to flat row format + - Automatically fixes pad flags for _mem kernels + - Captures both successes and failures + - Compatible with existing training data format +""" + +import argparse +import json +import pandas as pd +from pathlib import Path + + +def convert_json_to_parquet(json_file: Path, output_file: Path, arch: str = "gfx950"): + """Convert benchmark JSON to parquet training data format.""" + + print(f"Loading {json_file}...") + with open(json_file) as f: + data = json.load(f) + + metadata = data.get("metadata", {}) + dtype = metadata.get("dtype", "fp16") + layout = metadata.get("layout", "rcr") + + print(f" Data type: {dtype}") + print(f" Layout: {layout}") + print(f" Kernels: {metadata.get('num_kernels', 0)}") + print(f" Problem sizes: {metadata.get('num_problems', 0)}") + print() + + rows = [] + for kernel_result in data["results"]: + kernel_config = kernel_result["kernel_config"] + + for benchmark in kernel_result["benchmarks"]: + # Common fields for both valid and invalid runs + row = { + "op_type": "gemm_universal", + "dtype": dtype, + "layout": layout, + "arch": arch, + "kernel_name": kernel_config["name"], + "m": benchmark["m"], + "n": benchmark["n"], + "k": benchmark["k"], + "split_k": 1, + "is_valid": benchmark["is_valid"], + "run_id": 0, + "pipeline": kernel_config["pipeline"], + "epilogue": kernel_config["epilogue"], + "scheduler": kernel_config["scheduler"], + "pad_m": kernel_config["pad_m"], + "pad_n": kernel_config["pad_n"], + "pad_k": kernel_config["pad_k"], + "persistent": kernel_config["persistent"], + "tile_m": kernel_config["tile_m"], + "tile_n": kernel_config["tile_n"], + "tile_k": kernel_config["tile_k"], + "warp_m": kernel_config["warp_m"], + "warp_n": kernel_config["warp_n"], + "warp_k": kernel_config["warp_k"], + "warp_tile_m": kernel_config["warp_tile_m"], + "warp_tile_n": kernel_config["warp_tile_n"], + "warp_tile_k": kernel_config["warp_tile_k"], + } + + if benchmark["is_valid"]: + # Valid run - include performance metrics + row["measured_tflops"] = benchmark["tflops"] + row["latency_ms"] = benchmark["avg_time_ms"] + # Calculate bandwidth if needed + m, n, k = benchmark["m"], benchmark["n"], benchmark["k"] + bytes_transferred = (m * k + k * n + m * n) * 2 # FP16 = 2 bytes + if benchmark["avg_time_ms"] > 0: + row["bandwidth_gb_s"] = (bytes_transferred / 1e9) / ( + benchmark["avg_time_ms"] / 1000 + ) + else: + row["bandwidth_gb_s"] = 0.0 + else: + # Failed run - zero metrics + row["measured_tflops"] = 0.0 + row["latency_ms"] = 0.0 + row["bandwidth_gb_s"] = 0.0 + + rows.append(row) + + df = pd.DataFrame(rows) + + print(f"Converted {len(df):,} benchmark results") + print(f" Valid: {df['is_valid'].sum():,}") + print(f" Failed: {(~df['is_valid']).sum():,}") + print() + + # Fix pad flags for _mem kernels (critical for P1 features!) 
+ print("Fixing pad flags for _mem kernels...") + mem_mask = df["pipeline"] == "mem" + mem_count = mem_mask.sum() + + if mem_count > 0: + df.loc[mem_mask, "pad_m"] = True + df.loc[mem_mask, "pad_n"] = True + df.loc[mem_mask, "pad_k"] = True + print(f" ✓ Fixed {mem_count:,} _mem kernel rows") + print() + + # Save to parquet + df.to_parquet(output_file, index=False) + print(f"✓ Saved to {output_file}") + print() + + # Show statistics + print("=" * 80) + print("STATISTICS") + print("=" * 80) + print() + + print("Dimension ranges:") + print(f" M: {df['m'].min():,} - {df['m'].max():,}") + print(f" N: {df['n'].min():,} - {df['n'].max():,}") + print(f" K: {df['k'].min():,} - {df['k'].max():,}") + print() + + print("Pipeline distribution:") + print(df["pipeline"].value_counts()) + print() + + print("Pad flag distribution:") + pad_combos = df[["pad_m", "pad_n", "pad_k"]].value_counts() + print(pad_combos) + print() + + if (~df["is_valid"]).sum() > 0: + print("Failure analysis:") + failed = df[~df["is_valid"]] + print(f" Total failures: {len(failed):,}") + + # Group by pipeline + print("\n By pipeline:") + for pipeline, count in failed["pipeline"].value_counts().items(): + print(f" {pipeline}: {count:,}") + + # Show sample failures + print("\n Sample failures:") + for _, row in failed.head(5).iterrows(): + print( + f" {row['kernel_name'][:60]:60s} M={row['m']:4d} N={row['n']:4d} K={row['k']:4d}" + ) + + return df + + +def merge_datasets(parquet_files: list[Path], output_file: Path): + """Merge multiple parquet files into one.""" + + print("=" * 80) + print("MERGING DATASETS") + print("=" * 80) + print() + + dfs = [] + for pq_file in parquet_files: + if pq_file.exists(): + df = pd.read_parquet(pq_file) + print(f" {pq_file.name}: {len(df):,} rows") + dfs.append(df) + else: + print(f" ✗ {pq_file} not found, skipping") + + if not dfs: + print("No files to merge!") + return + + combined = pd.concat(dfs, ignore_index=True) + combined.to_parquet(output_file, index=False) + + print() + print(f"✓ Merged {len(combined):,} total rows to {output_file}") + print() + + # Show dtype distribution + print("Data type distribution:") + print(combined["dtype"].value_counts()) + print() + + print("Layout distribution:") + print(combined["layout"].value_counts()) + + +def main(): + parser = argparse.ArgumentParser( + description="Convert benchmark JSON to parquet training data", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "--input", type=str, required=True, help="Input JSON file from benchmark" + ) + parser.add_argument("--output", type=str, required=True, help="Output parquet file") + parser.add_argument("--arch", type=str, default="gfx950", help="GPU architecture") + parser.add_argument( + "--merge_with", type=str, nargs="*", help="Additional parquet files to merge" + ) + + args = parser.parse_args() + + input_file = Path(args.input) + output_file = Path(args.output) + + # Convert JSON to parquet + df = convert_json_to_parquet(input_file, output_file, args.arch) + + # Merge if requested + if args.merge_with: + merge_files = [output_file] + [Path(f) for f in args.merge_with] + merged_output = output_file.parent / f"{output_file.stem}_merged.parquet" + merge_datasets(merge_files, merged_output) + + +if __name__ == "__main__": + main() diff --git a/dispatcher/heuristics/data_pipeline.py b/dispatcher/heuristics/data_pipeline.py new file mode 100644 index 0000000000..c3f5f9ced7 --- /dev/null +++ b/dispatcher/heuristics/data_pipeline.py @@ -0,0 +1,394 @@ 
+#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Data pipeline for CK Tile heuristics. + +Parses benchmark logs and structured JSON into a canonical parquet dataset. +Supports: + - Streaming log format (Shape N: headers + inline JSON) from ck_tile profiling runs + - Structured JSON from generate_benchmark_data.py + - Direct parquet passthrough +""" + +import json +import re +import subprocess +import hashlib +from pathlib import Path +from typing import Optional + +import pandas as pd + + +CANONICAL_COLUMNS = [ + "op_type", + "dtype", + "layout", + "arch", + "kernel_name", + "m", + "n", + "k", + "split_k", + "measured_tflops", + "latency_ms", + "bandwidth_gb_s", + "is_valid", + "tile_m", + "tile_n", + "tile_k", + "warp_m", + "warp_n", + "warp_k", + "warp_tile_m", + "warp_tile_n", + "warp_tile_k", + "pipeline", + "scheduler", + "epilogue", + "pad_m", + "pad_n", + "pad_k", + "persistent", + "run_id", +] + + +def parse_kernel_name(name: str) -> dict: + """Extract kernel config fields from a gemm_universal kernel name. + + Name format: + gemm_universal_{dtype}_{layout}_{pipeline}_{epilogue}_{scheduler} + _{padM}_{padN}_{padK}_{persistent}_{tileM}x{tileN}x{tileK} + _{warpM}x{warpN}x{warpK}_{warpTileM}x{warpTileN}x{warpTileK} + """ + result = {} + try: + prefix_match = re.match( + r"gemm_universal_(\w+?)_((?:rcr|rrr|crr|ccr))_(.*)", name + ) + if not prefix_match: + return result + result["dtype"] = prefix_match.group(1) + result["layout"] = prefix_match.group(2) + remainder = prefix_match.group(3) + + parts = remainder.split("_") + if len(parts) < 10: + return result + + result["pipeline"] = parts[0] + result["epilogue"] = parts[1] + result["scheduler"] = parts[2] + result["pad_m"] = parts[3] == "True" + result["pad_n"] = parts[4] == "True" + result["pad_k"] = parts[5] == "True" + result["persistent"] = parts[6] == "True" + + tile_dims = parts[7].split("x") + warp_dims = parts[8].split("x") + warp_tile_dims = parts[9].split("x") + + result["tile_m"] = int(tile_dims[0]) + result["tile_n"] = int(tile_dims[1]) + result["tile_k"] = int(tile_dims[2]) + result["warp_m"] = int(warp_dims[0]) + result["warp_n"] = int(warp_dims[1]) + result["warp_k"] = int(warp_dims[2]) + result["warp_tile_m"] = int(warp_tile_dims[0]) + result["warp_tile_n"] = int(warp_tile_dims[1]) + result["warp_tile_k"] = int(warp_tile_dims[2]) + except (IndexError, ValueError): + pass + return result + + +def _layout_from_problem(problem: dict) -> str: + """Derive layout shorthand (rcr/rrr/etc.) from problem JSON fields.""" + la = problem.get("layout_a", "") + lb = problem.get("layout_b", "") + lc = problem.get("layout_c", "") + + def _tag(s): + s = s.lower() + if "row" in s: + return "r" + if "col" in s: + return "c" + return "?" + + return _tag(la) + _tag(lb) + _tag(lc) + + +def parse_streaming_log( + path: str | Path, + arch: str = "unknown", + run_id: Optional[str] = None, + op_type: str = "gemm_universal", +) -> pd.DataFrame: + """Parse a CK Tile streaming benchmark log into a canonical DataFrame. + + The log alternates between shape headers and JSON result blocks: + Shape N: M=16 N=1536 K=7168 dtype=fp8 layout=rcr + { + "name": "gemm_universal_...", + "problem": { ... }, + "perf_result": { "latency(ms)": ..., "tflops(TFlops)": ..., "bandwidth(GB/s)": ... 
} + } + """ + path = Path(path) + if run_id is None: + run_id = hashlib.md5(path.name.encode()).hexdigest()[:12] + + shape_re = re.compile( + r"Shape\s+\d+:\s+M=(\d+)\s+N=(\d+)\s+K=(\d+)\s+dtype=(\w+)\s+layout=(\w+)" + ) + + rows = [] + current_m, current_n, current_k = 0, 0, 0 + current_dtype, current_layout = "", "" + json_buf = [] + brace_depth = 0 + + with open(path, "r") as f: + for line in f: + stripped = line.strip() + + shape_match = shape_re.search(stripped) + if shape_match: + current_m = int(shape_match.group(1)) + current_n = int(shape_match.group(2)) + current_k = int(shape_match.group(3)) + current_dtype = shape_match.group(4) + current_layout = shape_match.group(5) + continue + + if brace_depth == 0 and stripped.startswith("{"): + json_buf = [stripped] + brace_depth = stripped.count("{") - stripped.count("}") + if brace_depth == 0: + raw = "\n".join(json_buf) + try: + obj = json.loads(raw) + except json.JSONDecodeError: + continue + else: + continue + elif brace_depth > 0: + json_buf.append(stripped) + brace_depth += stripped.count("{") - stripped.count("}") + if brace_depth <= 0: + brace_depth = 0 + raw = "\n".join(json_buf) + try: + obj = json.loads(raw) + except json.JSONDecodeError: + continue + else: + continue + else: + continue + + # If we get here, obj was successfully parsed + kernel_name = obj.get("name", "") + problem = obj.get("problem", {}) + perf = obj.get("perf_result", {}) + + m = problem.get("m", current_m) + n = problem.get("n", current_n) + k = problem.get("k", current_k) + split_k = problem.get("split_k", 1) + dtype = problem.get("dtype_a", current_dtype) + layout = ( + _layout_from_problem(problem) + if problem.get("layout_a") + else current_layout + ) + + tflops = perf.get("tflops(TFlops)", 0.0) + latency = perf.get("latency(ms)", 0.0) + bandwidth = perf.get("bandwidth(GB/s)", 0.0) + + kp = parse_kernel_name(kernel_name) + + row = { + "op_type": op_type, + "dtype": dtype, + "layout": layout, + "arch": arch, + "kernel_name": kernel_name, + "m": m, + "n": n, + "k": k, + "split_k": split_k, + "measured_tflops": tflops, + "latency_ms": latency, + "bandwidth_gb_s": bandwidth, + "is_valid": tflops > 0 and latency > 0, + "run_id": run_id, + } + row.update(kp) + rows.append(row) + + df = pd.DataFrame(rows) + for col in CANONICAL_COLUMNS: + if col not in df.columns: + df[col] = None + return df + + +def get_hardware_profile() -> dict: + """Capture GPU hardware profile from rocminfo.""" + profile = {} + try: + result = subprocess.run( + ["rocminfo"], capture_output=True, text=True, timeout=30 + ) + output = result.stdout + + gpu_section = False + for line in output.split("\n"): + line = line.strip() + if "Device Type:" in line and "GPU" in line: + gpu_section = True + continue + if gpu_section and "Device Type:" in line and "GPU" not in line: + break + if not gpu_section: + continue + + if ":" not in line: + continue + key, _, val = line.partition(":") + key = key.strip() + val = val.strip() + + mapping = { + "Name": "gfx_name", + "Marketing Name": "marketing_name", + "Compute Unit": "num_cus", + "SIMDs per CU": "simds_per_cu", + "Shader Engines": "shader_engines", + "Shader Arrs. per Eng.": "shader_arrays_per_engine", + "Max Clock Freq. 
(MHz)": "max_clock_mhz", + "Wavefront Size": "wavefront_size", + "Max Waves Per CU": "max_waves_per_cu", + "Chip ID": "chip_id", + } + + if key in mapping: + raw = val.split("(")[0].strip() + try: + profile[mapping[key]] = int(raw) + except ValueError: + profile[mapping[key]] = raw + + for line in output.split("\n"): + line = line.strip() + if line.startswith("L1:") and "num_cus" in profile: + raw = line.split(":")[1].strip().split("(")[0].strip() + try: + profile["l1_cache_kb"] = int(raw) + except ValueError: + pass + elif line.startswith("L2:"): + raw = line.split(":")[1].strip().split("(")[0].strip() + try: + profile["l2_cache_kb"] = int(raw) + except ValueError: + pass + elif line.startswith("L3:"): + raw = line.split(":")[1].strip().split("(")[0].strip() + try: + profile["l3_cache_kb"] = int(raw) + except ValueError: + pass + + except (subprocess.TimeoutExpired, FileNotFoundError): + pass + + return profile + + +def load_parquet(path: str | Path) -> pd.DataFrame: + """Load a canonical parquet dataset.""" + return pd.read_parquet(path) + + +def save_parquet(df: pd.DataFrame, path: str | Path): + """Save a DataFrame in canonical parquet format.""" + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + df.to_parquet(path, index=False, engine="pyarrow") + + +def build_training_dataset( + data_dir: str | Path, + op_type: str = "gemm_universal", + dtype: str = "fp8", +) -> pd.DataFrame: + """Load and merge all parquet files matching the given op/dtype from a directory.""" + data_dir = Path(data_dir) + frames = [] + for f in sorted(data_dir.glob("*.parquet")): + df = pd.read_parquet(f) + if "op_type" in df.columns: + df = df[df["op_type"] == op_type] + if "dtype" in df.columns: + df = df[df["dtype"] == dtype] + if len(df) > 0: + frames.append(df) + if not frames: + raise FileNotFoundError( + f"No parquet files with op_type={op_type}, dtype={dtype} in {data_dir}" + ) + return pd.concat(frames, ignore_index=True) + + +if __name__ == "__main__": + import argparse + import time + + parser = argparse.ArgumentParser(description="Parse CK Tile benchmark data") + parser.add_argument("input", help="Input file (log or parquet)") + parser.add_argument("--output", "-o", required=True, help="Output parquet path") + parser.add_argument("--arch", default="gfx950", help="GPU architecture") + parser.add_argument("--op_type", default="gemm_universal", help="Operation type") + parser.add_argument( + "--capture_hw", + action="store_true", + help="Capture hardware profile from rocminfo", + ) + args = parser.parse_args() + + input_path = Path(args.input) + + print(f"Parsing {input_path}...") + t0 = time.time() + + if input_path.suffix == ".parquet": + df = load_parquet(input_path) + else: + df = parse_streaming_log(input_path, arch=args.arch, op_type=args.op_type) + + elapsed = time.time() - t0 + print(f"Parsed {len(df)} rows in {elapsed:.1f}s") + print(f" Unique shapes: {df.groupby(['m', 'n', 'k']).ngroups}") + print(f" Unique kernels: {df['kernel_name'].nunique()}") + print(f" Valid rows: {df['is_valid'].sum()} / {len(df)}") + + if df["measured_tflops"].max() > 0: + print( + f" TFLOPS range: {df['measured_tflops'].min():.2f} - {df['measured_tflops'].max():.2f}" + ) + + if args.capture_hw: + hw = get_hardware_profile() + print(f" Hardware profile: {hw}") + for k, v in hw.items(): + df[f"hw_{k}"] = v + + save_parquet(df, args.output) + print(f"Saved to {args.output}") diff --git a/dispatcher/heuristics/dispatcher_integration.py b/dispatcher/heuristics/dispatcher_integration.py new file mode 100644 
index 0000000000..c449c1e816 --- /dev/null +++ b/dispatcher/heuristics/dispatcher_integration.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Dispatcher integration for ML-based kernel selection. + +Bridges the trained LightGBM Predictor with the CK Tile dispatcher's +kernel selection flow. Provides heuristic functions compatible with +both the Python pre-selection pattern (08_heuristics.py style) and +the C++ HeuristicFunction signature. + +Name mapping between feature engine and dispatcher KernelConfig: + Feature engine Dispatcher KernelConfig + --------------------- ---------------------- + warp_m (warps/block) wave_m + warp_n wave_n + warp_k wave_k + warp_tile_m warp_m + warp_tile_n warp_n + warp_tile_k warp_k + +Usage: + from dispatcher_integration import create_ml_heuristic + + heuristic = create_ml_heuristic("models/gemm_universal_fp8_gfx950") + best_spec = heuristic(M=1024, N=1024, K=1024, kernel_pool=KERNEL_POOL) +""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + + +from data_pipeline import parse_kernel_name +from predict import Predictor + + +LAYOUT_TO_DISPATCHER = { + "rcr": ("row", "col", "row"), + "rrr": ("row", "row", "row"), + "crr": ("col", "row", "row"), + "ccr": ("col", "col", "row"), +} + +DTYPE_TO_C_DTYPE = { + "fp8": "fp16", + "fp16": "fp16", + "bf16": "bf16", + "fp32": "fp32", +} + + +@dataclass +class MLKernelSpec: + """Kernel spec returned by the ML heuristic, compatible with the dispatcher + example pattern. Carries both the feature-engine-space config and the + dispatcher-space KernelConfig fields.""" + + kernel_name: str + predicted_tflops: float + + tile_m: int + tile_n: int + tile_k: int + wave_m: int + wave_n: int + wave_k: int + warp_m: int + warp_n: int + warp_k: int + pipeline: str + scheduler: str + epilogue: str + pad_m: bool + pad_n: bool + pad_k: bool + persistent: bool + + +def kernel_config_to_feature_dict(kernel_name: str) -> dict: + """Parse a tile-engine kernel name into a feature-engine-compatible dict. + + Returns a dict with fields matching what GemmUniversalFeatureEngine.extract() + expects for the kernel parameter: tile_m/n/k, warp_m/n/k (warps per block), + warp_tile_m/n/k, pipeline, scheduler, epilogue, pad_m/n/k, persistent. + """ + parsed = parse_kernel_name(kernel_name) + if not parsed: + return {} + parsed["kernel_name"] = kernel_name + return parsed + + +def feature_dict_to_dispatcher_config( + feat: dict, dtype: str = "fp8", arch: str = "gfx950" +) -> dict: + """Convert a feature-engine kernel dict to dispatcher KernelConfig fields. 
+ + Handles the naming inversion: + feature engine warp_m -> KernelConfig wave_m (warps per block) + feature engine warp_tile_m -> KernelConfig warp_m (elements per warp) + """ + layout = feat.get("layout", "rcr") + la, lb, lc = LAYOUT_TO_DISPATCHER.get(layout, ("row", "col", "row")) + c_dtype = DTYPE_TO_C_DTYPE.get(dtype, dtype) + + return { + "dtype_a": dtype, + "dtype_b": dtype, + "dtype_c": c_dtype, + "dtype_acc": "fp32", + "layout_a": la, + "layout_b": lb, + "layout_c": lc, + "tile_m": feat.get("tile_m", 128), + "tile_n": feat.get("tile_n", 128), + "tile_k": feat.get("tile_k", 64), + "wave_m": feat.get("warp_m", 2), + "wave_n": feat.get("warp_n", 2), + "wave_k": feat.get("warp_k", 1), + "warp_m": feat.get("warp_tile_m", 32), + "warp_n": feat.get("warp_tile_n", 32), + "warp_k": feat.get("warp_tile_k", 16), + "pipeline": feat.get("pipeline", "compv3"), + "scheduler": feat.get("scheduler", "intrawave"), + "epilogue": feat.get("epilogue", "cshuffle"), + "pad_m": feat.get("pad_m", True), + "pad_n": feat.get("pad_n", True), + "pad_k": feat.get("pad_k", True), + "gfx_arch": arch, + } + + +def feature_dict_to_ml_spec(feat: dict, predicted_tflops: float = 0.0) -> MLKernelSpec: + """Convert a feature-engine kernel dict + prediction to an MLKernelSpec.""" + return MLKernelSpec( + kernel_name=feat.get("kernel_name", "unknown"), + predicted_tflops=predicted_tflops, + tile_m=feat.get("tile_m", 128), + tile_n=feat.get("tile_n", 128), + tile_k=feat.get("tile_k", 64), + wave_m=feat.get("warp_m", 2), + wave_n=feat.get("warp_n", 2), + wave_k=feat.get("warp_k", 1), + warp_m=feat.get("warp_tile_m", 32), + warp_n=feat.get("warp_tile_n", 32), + warp_k=feat.get("warp_tile_k", 16), + pipeline=feat.get("pipeline", "compv3"), + scheduler=feat.get("scheduler", "intrawave"), + epilogue=feat.get("epilogue", "cshuffle"), + pad_m=feat.get("pad_m", False), + pad_n=feat.get("pad_n", False), + pad_k=feat.get("pad_k", False), + persistent=feat.get("persistent", False), + ) + + +def load_kernel_pool_from_binaries(bin_dir: str | Path) -> list[dict]: + """Discover benchmark executables and parse their names into feature dicts. + + Each executable name encodes the full kernel config. This creates the + candidate pool for the ML heuristic without needing a registry JSON export. + """ + bin_dir = Path(bin_dir) + configs = [] + for exe in sorted(bin_dir.glob("benchmark_gemm_universal_*")): + name = exe.stem.replace("benchmark_", "") + feat = kernel_config_to_feature_dict(name) + if feat and "tile_m" in feat: + configs.append(feat) + return configs + + +def create_ml_heuristic( + model_dir: str | Path, + dtype: str = "fp8", + arch: str = "gfx950", + layout: str = "rcr", + kernel_pool: Optional[list[dict]] = None, + bin_dir: Optional[str | Path] = None, +): + """Create an ML heuristic function for kernel selection. + + Returns a callable with signature: + (M: int, N: int, K: int) -> MLKernelSpec + + The returned function scores all candidate kernels using the trained + LightGBM regressor and returns the best one as an MLKernelSpec. + + Parameters + ---------- + model_dir : str or Path + Path to trained model directory (must contain model_tflops.lgbm or + model_tflops_log_big.lgbm and feature_spec.json). + dtype : str + Data type for the problem (fp8, fp16, bf16). + arch : str + GPU architecture (gfx942, gfx950). + layout : str + Matrix layout (rcr, rrr, crr, ccr). + kernel_pool : list of dict, optional + Pre-parsed kernel configs. If None, loads from bin_dir. 
+ bin_dir : str or Path, optional + Directory with benchmark executables. Used to build kernel_pool if + kernel_pool is not provided. Defaults to /workspace/ck_tile/bin. + """ + model_dir = Path(model_dir) + predictor = Predictor(model_dir) + + if kernel_pool is None: + if bin_dir is None: + bin_dir = Path("/workspace/ck_tile/bin") + kernel_pool = load_kernel_pool_from_binaries(bin_dir) + + if not kernel_pool: + raise ValueError( + "No kernel configs found. Check bin_dir or provide kernel_pool." + ) + + def heuristic(M: int, N: int, K: int) -> MLKernelSpec: + problem = { + "m": M, + "n": N, + "k": K, + "dtype": dtype, + "layout": layout, + "split_k": 1, + } + + ranked = predictor.rank_kernels(problem, kernel_pool) + + if not ranked: + feat = kernel_pool[0] + return feature_dict_to_ml_spec(feat, 0.0) + + best_name, best_tflops = ranked[0] + best_feat = next( + (kp for kp in kernel_pool if kp.get("kernel_name") == best_name), + kernel_pool[0], + ) + return feature_dict_to_ml_spec(best_feat, best_tflops) + + return heuristic + + +def create_ranked_heuristic( + model_dir: str | Path, + dtype: str = "fp8", + arch: str = "gfx950", + layout: str = "rcr", + kernel_pool: Optional[list[dict]] = None, + bin_dir: Optional[str | Path] = None, + top_k: int = 5, +): + """Create an ML heuristic that returns the top-K ranked kernel specs. + + Returns a callable with signature: + (M: int, N: int, K: int) -> list[MLKernelSpec] + + Useful when you want fallback options if the top-1 kernel fails to build. + """ + model_dir = Path(model_dir) + predictor = Predictor(model_dir) + + if kernel_pool is None: + if bin_dir is None: + bin_dir = Path("/workspace/ck_tile/bin") + kernel_pool = load_kernel_pool_from_binaries(bin_dir) + + name_to_feat = {kp.get("kernel_name", ""): kp for kp in kernel_pool} + + def heuristic(M: int, N: int, K: int) -> list[MLKernelSpec]: + problem = { + "m": M, + "n": N, + "k": K, + "dtype": dtype, + "layout": layout, + "split_k": 1, + } + + ranked = predictor.rank_kernels(problem, kernel_pool) + results = [] + for name, tflops in ranked[:top_k]: + feat = name_to_feat.get(name, kernel_pool[0]) + results.append(feature_dict_to_ml_spec(feat, tflops)) + return results + + return heuristic + + +def ml_spec_to_dispatcher_config( + spec: MLKernelSpec, dtype: str = "fp8", arch: str = "gfx950" +) -> dict: + """Convert an MLKernelSpec to a dict compatible with ctypes_utils.KernelConfig.""" + layout_a, layout_b, layout_c = "row", "col", "row" + c_dtype = DTYPE_TO_C_DTYPE.get(dtype, dtype) + + return { + "dtype_a": dtype, + "dtype_b": dtype, + "dtype_c": c_dtype, + "dtype_acc": "fp32", + "layout_a": layout_a, + "layout_b": layout_b, + "layout_c": layout_c, + "tile_m": spec.tile_m, + "tile_n": spec.tile_n, + "tile_k": spec.tile_k, + "wave_m": spec.wave_m, + "wave_n": spec.wave_n, + "wave_k": spec.wave_k, + "warp_m": spec.warp_m, + "warp_n": spec.warp_n, + "warp_k": spec.warp_k, + "pipeline": spec.pipeline, + "scheduler": spec.scheduler, + "epilogue": spec.epilogue, + "pad_m": spec.pad_m, + "pad_n": spec.pad_n, + "pad_k": spec.pad_k, + "gfx_arch": arch, + } diff --git a/dispatcher/heuristics/evaluate.py b/dispatcher/heuristics/evaluate.py new file mode 100644 index 0000000000..95c850aaf5 --- /dev/null +++ b/dispatcher/heuristics/evaluate.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Evaluation and reporting for CK Tile kernel performance models. 
+ +Computes: + - Global metrics: TFLOPS efficiency (mean, p10, p50, min), R2, NDCG@1, Top-K hit rate + - Per-slice breakdowns: by layout, shape family, K-depth regime, pipeline + - Cross-target consistency checks + - Feature importance analysis + +Usage: + python evaluate.py --model_dir models/gemm_universal_fp8_gfx950 --data_dir data/ +""" + +import argparse +import json + +import numpy as np +import pandas as pd + +from data_pipeline import build_training_dataset +from feature_engine import GemmUniversalFeatureEngine +from predict import Predictor +from train import compute_tflops_efficiency + + +def classify_shape_family(m: int, n: int, k: int) -> str: + """Classify a GEMM shape into a family for sliced evaluation. + + Families: + - tiny_m: M < 32 (single-token / very small batch inference) + - small_m: 32 <= M < 256 + - medium_m: 256 <= M < 4096 + - large_m: M >= 4096 + - square: 0.5 <= M/N <= 2.0 and 0.5 <= M/K <= 2.0 + - tall: M/N > 2.0 + - wide: M/N < 0.5 + """ + if m < 32: + return "tiny_m" + elif m < 256: + return "small_m" + elif m < 4096: + return "medium_m" + elif m >= 4096: + return "large_m" + return "other" + + +def classify_k_regime(k: int) -> str: + """Classify K dimension into depth regime.""" + if k < 512: + return "shallow_k" + elif k < 4096: + return "medium_k" + else: + return "deep_k" + + +def evaluate_model( + predictor: Predictor, + df: pd.DataFrame, + feature_engine: GemmUniversalFeatureEngine, +) -> dict: + """Run full evaluation on a dataset. Returns a metrics dictionary. + + Parameters + ---------- + predictor : Predictor + Trained predictor with at least a TFLOPS model loaded. + df : pd.DataFrame + Benchmark data in canonical schema. + feature_engine : GemmUniversalFeatureEngine + Feature engine matching the trained model. + + Returns + ------- + dict with keys: global_metrics, shape_family_metrics, k_regime_metrics, + pipeline_metrics, per_shape_efficiency. 
+ """ + valid = df[df["is_valid"].fillna(False) & (df["measured_tflops"] > 0)].copy() + valid = valid.reset_index(drop=True) + + X = feature_engine.extract_batch(valid) + model = predictor._load_model("tflops") + if model is None: + raise FileNotFoundError("No TFLOPS model found") + + # Predict and apply inverse log transform if model was trained in log-space + raw_pred = model.predict(X) + if "tflops" in predictor._log_targets: + valid["pred_tflops"] = np.expm1(raw_pred) + else: + # Clamp to non-negative even for non-log models + valid["pred_tflops"] = np.maximum(0.0, raw_pred) + + y_true = valid["measured_tflops"].values + y_pred = valid["pred_tflops"].values + + ss_res = np.sum((y_true - y_pred) ** 2) + ss_tot = np.sum((y_true - y_true.mean()) ** 2) + r2 = 1 - ss_res / max(ss_tot, 1e-10) + rmse = np.sqrt(np.mean((y_true - y_pred) ** 2)) + mae = np.mean(np.abs(y_true - y_pred)) + + eff_df = compute_tflops_efficiency(valid, "pred_tflops") + + ndcg1_count = 0 + total_shapes = 0 + topk_hits = {3: 0, 5: 0, 10: 0} + + for (m, n, k), group in valid.groupby(["m", "n", "k"]): + if group["measured_tflops"].max() <= 0: + continue + total_shapes += 1 + oracle_idx = group["measured_tflops"].idxmax() + pred_ranking = group.sort_values("pred_tflops", ascending=False).index.tolist() + + if pred_ranking[0] == oracle_idx: + ndcg1_count += 1 + + oracle_rank = pred_ranking.index(oracle_idx) + for topk in topk_hits: + if oracle_rank < topk: + topk_hits[topk] += 1 + + global_metrics = { + "r2": r2, + "rmse": rmse, + "mae": mae, + "num_valid_rows": len(valid), + "num_shapes": total_shapes, + "efficiency_mean": float(eff_df["efficiency"].mean()) if len(eff_df) > 0 else 0, + "efficiency_p10": float(eff_df["efficiency"].quantile(0.1)) + if len(eff_df) > 0 + else 0, + "efficiency_p50": float(eff_df["efficiency"].quantile(0.5)) + if len(eff_df) > 0 + else 0, + "efficiency_min": float(eff_df["efficiency"].min()) if len(eff_df) > 0 else 0, + "ndcg_at_1": ndcg1_count / max(total_shapes, 1), + "top3_hit_rate": topk_hits[3] / max(total_shapes, 1), + "top5_hit_rate": topk_hits[5] / max(total_shapes, 1), + "top10_hit_rate": topk_hits[10] / max(total_shapes, 1), + } + + def _slice_efficiency(slice_df): + if len(slice_df) == 0: + return {"count": 0} + eff = compute_tflops_efficiency(slice_df, "pred_tflops") + if len(eff) == 0: + return {"count": 0} + return { + "count": len(eff), + "mean": float(eff["efficiency"].mean()), + "p10": float(eff["efficiency"].quantile(0.1)), + "min": float(eff["efficiency"].min()), + } + + valid["shape_family"] = valid.apply( + lambda r: classify_shape_family(r["m"], r["n"], r["k"]), axis=1 + ) + valid["k_regime"] = valid["k"].apply(classify_k_regime) + + shape_family_metrics = {} + for family, group in valid.groupby("shape_family"): + shape_family_metrics[family] = _slice_efficiency(group) + + k_regime_metrics = {} + for regime, group in valid.groupby("k_regime"): + k_regime_metrics[regime] = _slice_efficiency(group) + + pipeline_metrics = {} + if "pipeline" in valid.columns: + for pipeline, group in valid.groupby("pipeline"): + pipeline_metrics[str(pipeline)] = _slice_efficiency(group) + + return { + "global_metrics": global_metrics, + "shape_family_metrics": shape_family_metrics, + "k_regime_metrics": k_regime_metrics, + "pipeline_metrics": pipeline_metrics, + "per_shape_efficiency": eff_df.to_dict(orient="records") + if len(eff_df) > 0 + else [], + } + + +def main(): + parser = argparse.ArgumentParser(description="Evaluate CK Tile performance model") + parser.add_argument( + "--model_dir", 
required=True, help="Directory with trained models" + ) + parser.add_argument("--data_dir", required=True, help="Directory with parquet data") + parser.add_argument("--op", default="gemm_universal") + parser.add_argument("--dtype", default="fp8") + parser.add_argument("--output", "-o", help="Output JSON path for metrics") + args = parser.parse_args() + + print(f"Loading data from {args.data_dir}...") + df = build_training_dataset(args.data_dir, op_type=args.op, dtype=args.dtype) + print(f" {len(df)} rows, {df.groupby(['m', 'n', 'k']).ngroups} shapes") + + fe = GemmUniversalFeatureEngine() + predictor = Predictor(args.model_dir, feature_engine=fe) + + print("Evaluating...") + results = evaluate_model(predictor, df, fe) + + gm = results["global_metrics"] + print("\nGlobal Metrics:") + print(f" R2: {gm['r2']:.4f}") + print(f" RMSE: {gm['rmse']:.2f}") + print(f" Efficiency Mean: {gm['efficiency_mean']:.4f}") + print(f" Efficiency P10: {gm['efficiency_p10']:.4f}") + print(f" Efficiency P50: {gm['efficiency_p50']:.4f}") + print(f" Efficiency Min: {gm['efficiency_min']:.4f}") + print(f" NDCG@1: {gm['ndcg_at_1']:.4f}") + print(f" Top-3 Hit Rate: {gm['top3_hit_rate']:.4f}") + print(f" Top-5 Hit Rate: {gm['top5_hit_rate']:.4f}") + print(f" Top-10 Hit Rate: {gm['top10_hit_rate']:.4f}") + + print("\nShape Family Breakdown:") + for family, metrics in sorted(results["shape_family_metrics"].items()): + if metrics.get("count", 0) > 0: + print( + f" {family:12s}: mean={metrics['mean']:.4f} p10={metrics['p10']:.4f} min={metrics['min']:.4f} (n={metrics['count']})" + ) + + print("\nK-Depth Regime Breakdown:") + for regime, metrics in sorted(results["k_regime_metrics"].items()): + if metrics.get("count", 0) > 0: + print( + f" {regime:12s}: mean={metrics['mean']:.4f} p10={metrics['p10']:.4f} min={metrics['min']:.4f} (n={metrics['count']})" + ) + + print("\nPipeline Breakdown:") + for pipeline, metrics in sorted(results["pipeline_metrics"].items()): + if metrics.get("count", 0) > 0: + print( + f" {pipeline:15s}: mean={metrics['mean']:.4f} p10={metrics['p10']:.4f} (n={metrics['count']})" + ) + + if args.output: + with open(args.output, "w") as f: + json.dump(results, f, indent=2, default=str) + print(f"\nFull results saved to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/dispatcher/heuristics/feature_engine.py b/dispatcher/heuristics/feature_engine.py new file mode 100644 index 0000000000..557d9d8992 --- /dev/null +++ b/dispatcher/heuristics/feature_engine.py @@ -0,0 +1,577 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Feature engineering for CK Tile kernel performance prediction. + +Provides a strict FeatureEngine interface with per-op subclasses. +All feature engines produce a consistent numpy array for LightGBM. +""" + +import math +from abc import ABC, abstractmethod + +import numpy as np +import pandas as pd + + +DTYPE_BYTES = { + "fp32": 4.0, + "fp16": 2.0, + "bf16": 2.0, + "fp8": 1.0, + "bf8": 1.0, + "int8": 1.0, + "int4": 0.5, +} + +LAYOUT_MAP = {"rcr": 0, "rrr": 1, "crr": 2, "ccr": 3} +PIPELINE_MAP = {"compv3": 0, "compv4": 1, "compv5": 2, "mem": 3, "preshufflev2": 4} +SCHEDULER_MAP = {"intrawave": 0, "interwave": 1} +EPILOGUE_MAP = {"default": 0, "cshuffle": 1} + + +class FeatureEngine(ABC): + """Abstract base for per-op feature extraction.""" + + @abstractmethod + def get_feature_names(self) -> list[str]: + """Ordered list of feature names matching the output array columns.""" + ... 
+ + @abstractmethod + def get_categorical_features(self) -> list[str]: + """Feature names that should be treated as categorical by LightGBM.""" + ... + + @abstractmethod + def extract(self, problem: dict, kernel: dict) -> np.ndarray: + """Extract a single feature vector from a (problem, kernel) pair.""" + ... + + def extract_batch(self, df: pd.DataFrame) -> np.ndarray: + """Vectorized batch extraction from a DataFrame. Override for speed.""" + names = self.get_feature_names() + result = np.zeros((len(df), len(names)), dtype=np.float64) + for i in range(len(df)): + row = df.iloc[i] + prob = row.to_dict() + kern = row.to_dict() + result[i] = self.extract(prob, kern) + return result + + def get_parameter_space(self) -> dict[str, list]: + """Valid discrete values for each kernel parameter (for surrogate search).""" + return {} + + def get_constraints(self) -> list: + """Multi-param constraint functions returning True if config is valid.""" + return [] + + def validate_config(self, config: dict) -> bool: + """Check all constraints. Returns True if the config is valid.""" + ps = self.get_parameter_space() + for k, valid_vals in ps.items(): + if k in config and config[k] not in valid_vals: + return False + for constraint in self.get_constraints(): + if not constraint(config): + return False + return True + + def project_to_valid(self, config: dict) -> dict: + """Snap a config to the nearest valid discrete point.""" + ps = self.get_parameter_space() + result = dict(config) + for k, valid_vals in ps.items(): + if k not in result: + continue + v = result[k] + if isinstance(valid_vals[0], (int, float)): + result[k] = min(valid_vals, key=lambda x: abs(x - v)) + elif v not in valid_vals: + result[k] = valid_vals[0] + return result + + +class GemmUniversalFeatureEngine(FeatureEngine): + """Feature engine for gemm_universal kernels.""" + + def __init__( + self, + num_cus: int = 256, + lds_capacity: int = 65536, + max_clock_mhz: int = 2400, + simds_per_cu: int = 4, + shader_engines: int = 32, + max_waves_per_cu: int = 32, + wavefront_size: int = 64, + l1_cache_kb: int = 32, + l2_cache_kb: int = 4096, + l3_cache_kb: int = 262144, + num_xcd: int = 8, + ): + self._hw = { + "num_cus": num_cus, + "lds_capacity": lds_capacity, + "max_clock_mhz": max_clock_mhz, + "simds_per_cu": simds_per_cu, + "shader_engines": shader_engines, + "max_waves_per_cu": max_waves_per_cu, + "wavefront_size": wavefront_size, + "l1_cache_kb": l1_cache_kb, + "l2_cache_kb": l2_cache_kb, + "l3_cache_kb": l3_cache_kb, + "num_xcd": num_xcd, + "total_simds": num_cus * simds_per_cu, + } + + def get_feature_names(self) -> list[str]: + return [ + # Problem features + "M", + "N", + "K", + "split_k", + "log2_M", + "log2_N", + "log2_K", + "log2_MNK", + "arithmetic_intensity", + "aspect_ratio_mn", + "aspect_ratio_mk", + "aspect_ratio_nk", + "layout", + # Kernel features + "tile_m", + "tile_n", + "tile_k", + "warp_m", + "warp_n", + "warp_k", + "warp_tile_m", + "warp_tile_n", + "warp_tile_k", + "pipeline", + "scheduler", + "epilogue", + "pad_m", + "pad_n", + "pad_k", + "persistent", + "num_warps", + "tile_volume", + "tile_mn", + "lds_usage_estimate", + "lds_usage_ratio", + # Interaction features + "num_tiles_m", + "num_tiles_n", + "num_tiles_k", + "total_output_tiles", + "tile_eff_m", + "tile_eff_n", + "tile_eff_k", + "overall_tile_efficiency", + "cu_utilization", + # P0 FIX: Problem-to-tile ratio features + "ratio_M_to_tile_m", + "ratio_N_to_tile_n", + "ratio_K_to_tile_k", + "problem_smaller_than_tile_m", + "problem_smaller_than_tile_n", + 
"problem_smaller_than_tile_k", + "any_dim_too_small", + # P1 FIX: Padding requirement interaction features + "needs_padding_m", + "needs_padding_n", + "needs_padding_k", + "has_padding_when_needed_m", + "has_padding_when_needed_n", + "has_padding_when_needed_k", + "missing_required_padding_m", + "missing_required_padding_n", + "missing_required_padding_k", + "missing_any_required_padding", + # Hardware features + "hw_num_cus", + "hw_simds_per_cu", + "hw_total_simds", + "hw_shader_engines", + "hw_max_clock_mhz", + "hw_max_waves_per_cu", + "hw_wavefront_size", + "hw_lds_capacity", + "hw_l1_cache_kb", + "hw_l2_cache_kb", + "hw_l3_cache_kb", + "hw_num_xcd", + ] + + def get_categorical_features(self) -> list[str]: + return ["layout", "pipeline", "scheduler", "epilogue"] + + def extract(self, problem: dict, kernel: dict) -> np.ndarray: + M = int(problem.get("m", problem.get("M", 0))) + N = int(problem.get("n", problem.get("N", 0))) + K = int(problem.get("k", problem.get("K", 0))) + split_k = int(problem.get("split_k", 1)) + dtype = str(problem.get("dtype", "fp8")) + bpe = DTYPE_BYTES.get(dtype, 1.0) + + log2_M = math.log2(max(M, 1)) + log2_N = math.log2(max(N, 1)) + log2_K = math.log2(max(K, 1)) + log2_MNK = math.log2(max(M * N * K, 1)) + + mem_bytes = (M * K + K * N + M * N) * bpe + ai = (2.0 * M * N * K) / max(mem_bytes, 1) + + ar_mn = M / max(N, 1) + ar_mk = M / max(K, 1) + ar_nk = N / max(K, 1) + + layout_code = LAYOUT_MAP.get(str(problem.get("layout", "rcr")), 0) + + tile_m = int(kernel.get("tile_m", 128)) + tile_n = int(kernel.get("tile_n", 128)) + tile_k = int(kernel.get("tile_k", 64)) + warp_m = int(kernel.get("warp_m", 2)) + warp_n = int(kernel.get("warp_n", 2)) + warp_k = int(kernel.get("warp_k", 1)) + warp_tile_m = int(kernel.get("warp_tile_m", 32)) + warp_tile_n = int(kernel.get("warp_tile_n", 32)) + warp_tile_k = int(kernel.get("warp_tile_k", 16)) + + pipeline_code = PIPELINE_MAP.get(str(kernel.get("pipeline", "compv4")), 0) + scheduler_code = SCHEDULER_MAP.get(str(kernel.get("scheduler", "intrawave")), 0) + epilogue_code = EPILOGUE_MAP.get(str(kernel.get("epilogue", "cshuffle")), 0) + + pad_m = float(kernel.get("pad_m", False)) + pad_n = float(kernel.get("pad_n", False)) + pad_k = float(kernel.get("pad_k", False)) + persistent = float(kernel.get("persistent", False)) + + num_warps = warp_m * warp_n * warp_k + tile_volume = tile_m * tile_n * tile_k + tile_mn = tile_m * tile_n + + lds_est = (tile_m * tile_k + tile_n * tile_k) * bpe + lds_cap = self._hw["lds_capacity"] + if str(kernel.get("pipeline", "")).startswith("compv4"): + lds_cap = 32768 + lds_ratio = lds_est / max(lds_cap, 1) + + num_tiles_m = math.ceil(M / max(tile_m, 1)) + num_tiles_n = math.ceil(N / max(tile_n, 1)) + num_tiles_k = math.ceil(K / max(tile_k, 1)) + total_output_tiles = num_tiles_m * num_tiles_n + + rem_m = M % tile_m if tile_m > 0 else 0 + tile_eff_m = rem_m / tile_m if rem_m > 0 else 1.0 + rem_n = N % tile_n if tile_n > 0 else 0 + tile_eff_n = rem_n / tile_n if rem_n > 0 else 1.0 + rem_k = K % tile_k if tile_k > 0 else 0 + tile_eff_k = rem_k / tile_k if rem_k > 0 else 1.0 + overall_eff = tile_eff_m * tile_eff_n * tile_eff_k + + cu_util = total_output_tiles / max(self._hw["num_cus"], 1) + + # P0 FIX: Problem-to-tile ratio features (avoid oversized tiles for tiny problems) + ratio_M_to_tile_m = M / max(tile_m, 1) + ratio_N_to_tile_n = N / max(tile_n, 1) + ratio_K_to_tile_k = K / max(tile_k, 1) + + # Binary features: is problem dimension smaller than tile? 
+ problem_smaller_than_tile_m = float(M < tile_m) + problem_smaller_than_tile_n = float(N < tile_n) + problem_smaller_than_tile_k = float(K < tile_k) + any_dim_too_small = float((M < tile_m) or (N < tile_n) or (K < tile_k)) + + # P1 FIX: Padding requirement features (does this kernel have padding when needed?) + needs_padding_m = float(M % tile_m != 0) if tile_m > 0 else 0.0 + needs_padding_n = float(N % tile_n != 0) if tile_n > 0 else 0.0 + needs_padding_k = float(K % tile_k != 0) if tile_k > 0 else 0.0 + + # Interaction features: kernel has padding capability when problem needs it + has_padding_when_needed_m = float(needs_padding_m and pad_m) + has_padding_when_needed_n = float(needs_padding_n and pad_n) + has_padding_when_needed_k = float(needs_padding_k and pad_k) + + # Critical feature: missing required padding (kernel will likely fail) + missing_required_padding_m = float(needs_padding_m and not pad_m) + missing_required_padding_n = float(needs_padding_n and not pad_n) + missing_required_padding_k = float(needs_padding_k and not pad_k) + missing_any_required_padding = float( + missing_required_padding_m + or missing_required_padding_n + or missing_required_padding_k + ) + + hw = self._hw + return np.array( + [ + M, + N, + K, + split_k, + log2_M, + log2_N, + log2_K, + log2_MNK, + ai, + ar_mn, + ar_mk, + ar_nk, + layout_code, + tile_m, + tile_n, + tile_k, + warp_m, + warp_n, + warp_k, + warp_tile_m, + warp_tile_n, + warp_tile_k, + pipeline_code, + scheduler_code, + epilogue_code, + pad_m, + pad_n, + pad_k, + persistent, + num_warps, + tile_volume, + tile_mn, + lds_est, + lds_ratio, + num_tiles_m, + num_tiles_n, + num_tiles_k, + total_output_tiles, + tile_eff_m, + tile_eff_n, + tile_eff_k, + overall_eff, + cu_util, + # P0 FIX: New ratio and binary features + ratio_M_to_tile_m, + ratio_N_to_tile_n, + ratio_K_to_tile_k, + problem_smaller_than_tile_m, + problem_smaller_than_tile_n, + problem_smaller_than_tile_k, + any_dim_too_small, + # P1 FIX: Padding requirement interaction features + needs_padding_m, + needs_padding_n, + needs_padding_k, + has_padding_when_needed_m, + has_padding_when_needed_n, + has_padding_when_needed_k, + missing_required_padding_m, + missing_required_padding_n, + missing_required_padding_k, + missing_any_required_padding, + hw["num_cus"], + hw["simds_per_cu"], + hw["total_simds"], + hw["shader_engines"], + hw["max_clock_mhz"], + hw["max_waves_per_cu"], + hw["wavefront_size"], + hw["lds_capacity"], + hw["l1_cache_kb"], + hw["l2_cache_kb"], + hw["l3_cache_kb"], + hw["num_xcd"], + ], + dtype=np.float64, + ) + + def extract_batch(self, df: pd.DataFrame) -> np.ndarray: + """Vectorized batch extraction -- much faster than row-by-row.""" + n = len(df) + names = self.get_feature_names() + result = np.zeros((n, len(names)), dtype=np.float64) + + M = df["m"].values.astype(np.float64) + N = df["n"].values.astype(np.float64) + K = df["k"].values.astype(np.float64) + split_k = df["split_k"].fillna(1).values.astype(np.float64) + + dtype_col = df["dtype"].fillna("fp8") + bpe = dtype_col.map(DTYPE_BYTES).fillna(1.0).values + + result[:, 0] = M + result[:, 1] = N + result[:, 2] = K + result[:, 3] = split_k + result[:, 4] = np.log2(np.maximum(M, 1)) + result[:, 5] = np.log2(np.maximum(N, 1)) + result[:, 6] = np.log2(np.maximum(K, 1)) + result[:, 7] = np.log2(np.maximum(M * N * K, 1)) + + mem = (M * K + K * N + M * N) * bpe + result[:, 8] = (2.0 * M * N * K) / np.maximum(mem, 1) + result[:, 9] = M / np.maximum(N, 1) + result[:, 10] = M / np.maximum(K, 1) + result[:, 11] = N / 
np.maximum(K, 1) + + result[:, 12] = df["layout"].map(LAYOUT_MAP).fillna(0).values + + tile_m = df["tile_m"].fillna(128).values.astype(np.float64) + tile_n = df["tile_n"].fillna(128).values.astype(np.float64) + tile_k = df["tile_k"].fillna(64).values.astype(np.float64) + warp_m = df["warp_m"].fillna(2).values.astype(np.float64) + warp_n = df["warp_n"].fillna(2).values.astype(np.float64) + warp_k = df["warp_k"].fillna(1).values.astype(np.float64) + warp_tile_m = df["warp_tile_m"].fillna(32).values.astype(np.float64) + warp_tile_n = df["warp_tile_n"].fillna(32).values.astype(np.float64) + warp_tile_k = df["warp_tile_k"].fillna(16).values.astype(np.float64) + + result[:, 13] = tile_m + result[:, 14] = tile_n + result[:, 15] = tile_k + result[:, 16] = warp_m + result[:, 17] = warp_n + result[:, 18] = warp_k + result[:, 19] = warp_tile_m + result[:, 20] = warp_tile_n + result[:, 21] = warp_tile_k + + result[:, 22] = df["pipeline"].map(PIPELINE_MAP).fillna(0).values + result[:, 23] = df["scheduler"].map(SCHEDULER_MAP).fillna(0).values + result[:, 24] = df["epilogue"].map(EPILOGUE_MAP).fillna(0).values + + result[:, 25] = df["pad_m"].fillna(False).astype(float).values + result[:, 26] = df["pad_n"].fillna(False).astype(float).values + result[:, 27] = df["pad_k"].fillna(False).astype(float).values + result[:, 28] = df["persistent"].fillna(False).astype(float).values + + num_warps = warp_m * warp_n * warp_k + result[:, 29] = num_warps + result[:, 30] = tile_m * tile_n * tile_k + result[:, 31] = tile_m * tile_n + + lds_est = (tile_m * tile_k + tile_n * tile_k) * bpe + result[:, 32] = lds_est + lds_cap = np.full(n, self._hw["lds_capacity"], dtype=np.float64) + is_compv4 = df["pipeline"].fillna("").str.startswith("compv4") + lds_cap[is_compv4] = 32768 + result[:, 33] = lds_est / np.maximum(lds_cap, 1) + + ntm = np.ceil(M / np.maximum(tile_m, 1)) + ntn = np.ceil(N / np.maximum(tile_n, 1)) + ntk = np.ceil(K / np.maximum(tile_k, 1)) + result[:, 34] = ntm + result[:, 35] = ntn + result[:, 36] = ntk + result[:, 37] = ntm * ntn + + rem_m = np.mod(M, np.maximum(tile_m, 1)) + result[:, 38] = np.where(rem_m > 0, rem_m / tile_m, 1.0) + rem_n = np.mod(N, np.maximum(tile_n, 1)) + result[:, 39] = np.where(rem_n > 0, rem_n / tile_n, 1.0) + rem_k = np.mod(K, np.maximum(tile_k, 1)) + result[:, 40] = np.where(rem_k > 0, rem_k / tile_k, 1.0) + result[:, 41] = result[:, 38] * result[:, 39] * result[:, 40] + + result[:, 42] = (ntm * ntn) / max(self._hw["num_cus"], 1) + + # P0 FIX: Problem-to-tile ratio features + result[:, 43] = M / np.maximum(tile_m, 1) # ratio_M_to_tile_m + result[:, 44] = N / np.maximum(tile_n, 1) # ratio_N_to_tile_n + result[:, 45] = K / np.maximum(tile_k, 1) # ratio_K_to_tile_k + + # Binary features: is problem smaller than tile? 
+ result[:, 46] = (M < tile_m).astype(float) # problem_smaller_than_tile_m + result[:, 47] = (N < tile_n).astype(float) # problem_smaller_than_tile_n + result[:, 48] = (K < tile_k).astype(float) # problem_smaller_than_tile_k + result[:, 49] = ((M < tile_m) | (N < tile_n) | (K < tile_k)).astype( + float + ) # any_dim_too_small + + # P1 FIX: Padding requirement features + pad_m_bool = df["pad_m"].fillna(False).astype(bool).values + pad_n_bool = df["pad_n"].fillna(False).astype(bool).values + pad_k_bool = df["pad_k"].fillna(False).astype(bool).values + + needs_padding_m = (np.mod(M, np.maximum(tile_m, 1)) != 0) + needs_padding_n = (np.mod(N, np.maximum(tile_n, 1)) != 0) + needs_padding_k = (np.mod(K, np.maximum(tile_k, 1)) != 0) + + result[:, 50] = needs_padding_m.astype(float) + result[:, 51] = needs_padding_n.astype(float) + result[:, 52] = needs_padding_k.astype(float) + + # Interaction features: kernel has padding when problem needs it + result[:, 53] = (needs_padding_m & pad_m_bool).astype(float) # has_padding_when_needed_m + result[:, 54] = (needs_padding_n & pad_n_bool).astype(float) # has_padding_when_needed_n + result[:, 55] = (needs_padding_k & pad_k_bool).astype(float) # has_padding_when_needed_k + + # Critical feature: missing required padding + result[:, 56] = (needs_padding_m & ~pad_m_bool).astype(float) # missing_required_padding_m + result[:, 57] = (needs_padding_n & ~pad_n_bool).astype(float) # missing_required_padding_n + result[:, 58] = (needs_padding_k & ~pad_k_bool).astype(float) # missing_required_padding_k + result[:, 59] = ((needs_padding_m & ~pad_m_bool) | (needs_padding_n & ~pad_n_bool) | (needs_padding_k & ~pad_k_bool)).astype(float) # missing_any_required_padding + + # Hardware profile features + hw = self._hw + result[:, 60] = hw["num_cus"] + result[:, 61] = hw["simds_per_cu"] + result[:, 62] = hw["total_simds"] + result[:, 63] = hw["shader_engines"] + result[:, 64] = hw["max_clock_mhz"] + result[:, 65] = hw["max_waves_per_cu"] + result[:, 66] = hw["wavefront_size"] + result[:, 67] = hw["lds_capacity"] + result[:, 68] = hw["l1_cache_kb"] + result[:, 69] = hw["l2_cache_kb"] + result[:, 70] = hw["l3_cache_kb"] + result[:, 71] = hw["num_xcd"] + + return result + + def get_parameter_space(self) -> dict[str, list]: + return { + "tile_m": [32, 64, 128, 192, 256], + "tile_n": [32, 64, 128, 192, 256], + "tile_k": [32, 64, 128, 256], + "warp_m": [1, 2, 4], + "warp_n": [1, 2, 4], + "warp_k": [1], + "warp_tile_m": [4, 16, 32, 64], + "warp_tile_n": [4, 16, 32, 64], + "warp_tile_k": [8, 16, 32, 64, 128], + "pipeline": list(PIPELINE_MAP.keys()), + "scheduler": list(SCHEDULER_MAP.keys()), + "epilogue": list(EPILOGUE_MAP.keys()), + "pad_m": [True, False], + "pad_n": [True, False], + "pad_k": [True, False], + "persistent": [True, False], + } + + def get_constraints(self) -> list: + lds_cap = self._hw["lds_capacity"] + + def _lds_constraint(cfg): + tm = cfg.get("tile_m", 128) + tn = cfg.get("tile_n", 128) + tk = cfg.get("tile_k", 64) + bpe = 1.0 # fp8 default + est = (tm * tk + tn * tk) * bpe + cap = ( + 32768 if str(cfg.get("pipeline", "")).startswith("compv4") else lds_cap + ) + return est <= cap + + def _warp_constraint(cfg): + wm = cfg.get("warp_m", 2) + wn = cfg.get("warp_n", 2) + wk = cfg.get("warp_k", 1) + return (wm * wn * wk) in [2, 4, 8] + + return [_lds_constraint, _warp_constraint] diff --git a/dispatcher/heuristics/generate_benchmark_data.py b/dispatcher/heuristics/generate_benchmark_data.py new file mode 100644 index 0000000000..17c76e5967 --- /dev/null +++ 
b/dispatcher/heuristics/generate_benchmark_data.py @@ -0,0 +1,553 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +GEMM Universal Benchmark Data Generation Script + +This script generates training data for ML-based kernel selection heuristics by: +1. Reading kernel configurations from the tile engine +2. Building benchmark executables (in parallel) +3. Running benchmarks across multiple problem sizes +4. Outputting performance data in JSON format + +Usage: + python generate_benchmark_data.py \ + --build_dir /tmp/build \ + --output_dir /tmp/benchmark_data \ + --dtype fp16 \ + --layout rcr \ + --num_build_jobs 4 \ + --num_benchmark_jobs 1 + +Requirements: + - ROCm-capable GPU + - CK tile engine built with CMake +""" + +import argparse +import json +import subprocess +import time +from concurrent.futures import ProcessPoolExecutor, as_completed +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import List, Optional, Tuple +import re + + +@dataclass +class KernelConfig: + """Represents a single kernel configuration.""" + + name: str + dtype: str + layout: str + pipeline: str + epilogue: str + scheduler: str + pad_m: bool + pad_n: bool + pad_k: bool + persistent: bool + tile_m: int + tile_n: int + tile_k: int + warp_m: int + warp_n: int + warp_k: int + warp_tile_m: int + warp_tile_n: int + warp_tile_k: int + + @classmethod + def from_kernel_name(cls, name: str, dtype: str, layout: str) -> "KernelConfig": + """Parse kernel name to extract configuration.""" + # Format: gemm_universal_{dtype}_{layout}_{pipeline}_{epilogue}_{scheduler}_{padM}_{padN}_{padK}_{persistent}_{tile_config} + # tile_config: {tile_m}x{tile_n}x{tile_k}_{warp_m}x{warp_n}x{warp_k}_{warp_tile_m}x{warp_tile_n}x{warp_tile_k} + + parts = name.split("_") + prefix = f"gemm_universal_{dtype}_{layout}_" + trait_and_tile = name[len(prefix) :] + trait_parts = trait_and_tile.split("_") + + pipeline = trait_parts[0] + epilogue = trait_parts[1] + scheduler = trait_parts[2] + pad_m = trait_parts[3] == "True" + pad_n = trait_parts[4] == "True" + pad_k = trait_parts[5] == "True" + persistent = trait_parts[6] == "True" + + # Parse tile config + tile_dims = trait_parts[7].split("x") + warp_dims = trait_parts[8].split("x") + warp_tile_dims = trait_parts[9].split("x") + + return cls( + name=name, + dtype=dtype, + layout=layout, + pipeline=pipeline, + epilogue=epilogue, + scheduler=scheduler, + pad_m=pad_m, + pad_n=pad_n, + pad_k=pad_k, + persistent=persistent, + tile_m=int(tile_dims[0]), + tile_n=int(tile_dims[1]), + tile_k=int(tile_dims[2]), + warp_m=int(warp_dims[0]), + warp_n=int(warp_dims[1]), + warp_k=int(warp_dims[2]), + warp_tile_m=int(warp_tile_dims[0]), + warp_tile_n=int(warp_tile_dims[1]), + warp_tile_k=int(warp_tile_dims[2]), + ) + + +@dataclass +class BenchmarkResult: + """Result of a single benchmark run.""" + + kernel_name: str + m: int + n: int + k: int + avg_time_ms: float + tflops: float + is_valid: bool + error: Optional[str] = None + + +@dataclass +class ProblemSize: + """GEMM problem dimensions.""" + + m: int + n: int + k: int + + +def get_problem_sizes() -> List[ProblemSize]: + """ + Generate diverse problem sizes for benchmarking. 
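+
+ Representative entries (drawn from the lists below):
+ ProblemSize(1, 4096, 11008) -> single-token LLaMA MLP up-projection
+ ProblemSize(8192, 8192, 256) -> short-K, memory-bound case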
+ + Includes: + - Square matrices (powers of 2) + - Rectangular matrices (common in ML) + - LLM-specific sizes (attention, MLP) + - Edge cases (small, very large) + """ + sizes = [] + + # Powers of 2 (square) + for p in [6, 7, 8, 9, 10, 11, 12, 13]: # 64 to 8192 + dim = 2**p + sizes.append(ProblemSize(dim, dim, dim)) + + # Common ML sizes (batch x hidden) + ml_sizes = [ + (1, 4096, 4096), # Single token inference + (8, 4096, 4096), # Small batch + (32, 4096, 4096), # Medium batch + (128, 4096, 4096), # Large batch + (1, 4096, 11008), # LLaMA MLP up-projection + (1, 11008, 4096), # LLaMA MLP down-projection + (32, 4096, 11008), + (32, 11008, 4096), + (1, 8192, 8192), # Large model + (32, 8192, 8192), + (1, 8192, 28672), # LLaMA-70B MLP + (32, 8192, 28672), + ] + for m, n, k in ml_sizes: + sizes.append(ProblemSize(m, n, k)) + + # Rectangular matrices + rect_sizes = [ + (1024, 4096, 1024), + (4096, 1024, 4096), + (2048, 8192, 2048), + (256, 256, 8192), # Tall K + (8192, 8192, 256), # Short K + ] + for m, n, k in rect_sizes: + sizes.append(ProblemSize(m, n, k)) + + # Remove duplicates while preserving order + seen = set() + unique_sizes = [] + for s in sizes: + key = (s.m, s.n, s.k) + if key not in seen: + seen.add(key) + unique_sizes.append(s) + + return unique_sizes + + +def load_kernel_list(build_dir: Path, dtype: str, layout: str) -> List[KernelConfig]: + """Load kernel configurations from the tile engine build.""" + kernel_list_path = ( + build_dir + / "tile_engine" + / "ops" + / "gemm" + / "gemm_universal" + / dtype + / layout + / "gemm_universal_kernel_list.txt" + ) + + if not kernel_list_path.exists(): + raise FileNotFoundError(f"Kernel list not found: {kernel_list_path}") + + kernels = [] + with open(kernel_list_path, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + # Format: kernel_name|tile_config|trait_combo + parts = line.split("|") + kernel_name = parts[0] + kernels.append(KernelConfig.from_kernel_name(kernel_name, dtype, layout)) + + return kernels + + +def build_kernel(build_dir: Path, kernel: KernelConfig) -> Tuple[str, bool, str]: + """ + Build a single kernel benchmark executable. + + Returns: (kernel_name, success, error_message) + """ + target_name = f"benchmark_{kernel.name}" + + try: + result = subprocess.run( + ["ninja", "-j1", target_name], + cwd=build_dir, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + + if result.returncode != 0: + return (kernel.name, False, result.stderr[:500]) + + return (kernel.name, True, "") + except subprocess.TimeoutExpired: + return (kernel.name, False, "Build timeout") + except Exception as e: + return (kernel.name, False, str(e)) + + +def run_benchmark( + build_dir: Path, + kernel: KernelConfig, + problem: ProblemSize, + warmup: int = 10, + repeat: int = 50, +) -> BenchmarkResult: + """ + Run benchmark for a single kernel and problem size. 
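+
+ The benchmark executable (bin/benchmark_<kernel_name>) is expected to print a
+ JSON block of the form {"perf_result": {"latency(ms)": ..., "tflops(TFlops)": ...}}.
+ If no JSON is found, latency and TFLOPS are recovered from the text output via
+ regular expressions, and TFLOPS is derived from 2*M*N*K and the measured time
+ when not reported.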
+ """ + exe_path = build_dir / "bin" / f"benchmark_{kernel.name}" + + if not exe_path.exists(): + return BenchmarkResult( + kernel_name=kernel.name, + m=problem.m, + n=problem.n, + k=problem.k, + avg_time_ms=0, + tflops=0, + is_valid=False, + error="Executable not found", + ) + + try: + result = subprocess.run( + [ + str(exe_path), + f"-m={problem.m}", + f"-n={problem.n}", + f"-k={problem.k}", + f"-warmup={warmup}", + f"-repeat={repeat}", + "-verify=0", + "-json_output=true", + ], + capture_output=True, + text=True, + timeout=120, + ) + + if result.returncode != 0: + # Try to parse error + error = result.stderr[:200] if result.stderr else result.stdout[:200] + return BenchmarkResult( + kernel_name=kernel.name, + m=problem.m, + n=problem.n, + k=problem.k, + avg_time_ms=0, + tflops=0, + is_valid=False, + error=error, + ) + + # Parse JSON output + output = result.stdout.strip() + + # Try to find JSON in output + json_match = re.search(r"\{.*\}", output, re.DOTALL) + if json_match: + data = json.loads(json_match.group()) + # Extract from nested perf_result object + perf = data.get("perf_result", {}) + avg_time_ms = perf.get("latency(ms)", 0) + tflops = perf.get("tflops(TFlops)", 0) + + return BenchmarkResult( + kernel_name=kernel.name, + m=problem.m, + n=problem.n, + k=problem.k, + avg_time_ms=avg_time_ms, + tflops=tflops, + is_valid=True, + ) + else: + # Parse from text output + # Look for patterns like "avg_time: X ms" or "tflops: Y" + avg_time = 0.0 + tflops = 0.0 + + time_match = re.search( + r"(?:avg[_\s]?time|latency)[:\s]+(\d+\.?\d*)\s*(?:ms)?", output, re.I + ) + if time_match: + avg_time = float(time_match.group(1)) + + tflops_match = re.search(r"tflops[:\s]+(\d+\.?\d*)", output, re.I) + if tflops_match: + tflops = float(tflops_match.group(1)) + + # Calculate TFLOPs if not provided + if tflops == 0 and avg_time > 0: + flops = 2.0 * problem.m * problem.n * problem.k + tflops = flops / (avg_time * 1e-3) / 1e12 + + return BenchmarkResult( + kernel_name=kernel.name, + m=problem.m, + n=problem.n, + k=problem.k, + avg_time_ms=avg_time, + tflops=tflops, + is_valid=avg_time > 0, + error=None if avg_time > 0 else "Could not parse output", + ) + + except subprocess.TimeoutExpired: + return BenchmarkResult( + kernel_name=kernel.name, + m=problem.m, + n=problem.n, + k=problem.k, + avg_time_ms=0, + tflops=0, + is_valid=False, + error="Benchmark timeout", + ) + except Exception as e: + return BenchmarkResult( + kernel_name=kernel.name, + m=problem.m, + n=problem.n, + k=problem.k, + avg_time_ms=0, + tflops=0, + is_valid=False, + error=str(e), + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Generate GEMM benchmark data for ML training" + ) + parser.add_argument( + "--build_dir", type=str, default="/tmp/build", help="CK build directory" + ) + parser.add_argument( + "--output_dir", + type=str, + default="/tmp/benchmark_data", + help="Output directory for benchmark results", + ) + parser.add_argument( + "--dtype", + type=str, + default="fp16", + choices=["fp16", "fp8", "bf16", "bf8"], + help="Data type to benchmark", + ) + parser.add_argument( + "--layout", + type=str, + default="rcr", + choices=["rcr", "rrr", "crr", "ccr"], + help="Matrix layout to benchmark", + ) + parser.add_argument( + "--num_build_jobs", type=int, default=4, help="Number of parallel build jobs" + ) + parser.add_argument( + "--num_benchmark_jobs", + type=int, + default=1, + help="Number of parallel benchmark jobs (use 1 for accurate timing)", + ) + parser.add_argument( + "--max_kernels", + type=int, + 
default=None, + help="Maximum number of kernels to benchmark (for testing)", + ) + parser.add_argument( + "--skip_build", + action="store_true", + help="Skip building and only run benchmarks", + ) + parser.add_argument( + "--warmup", type=int, default=10, help="Number of warmup iterations" + ) + parser.add_argument( + "--repeat", type=int, default=50, help="Number of benchmark iterations" + ) + + args = parser.parse_args() + + build_dir = Path(args.build_dir) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Load kernel configurations + print(f"Loading kernel list for {args.dtype}/{args.layout}...") + kernels = load_kernel_list(build_dir, args.dtype, args.layout) + print(f"Found {len(kernels)} kernel configurations") + + if args.max_kernels: + kernels = kernels[: args.max_kernels] + print(f"Limiting to {len(kernels)} kernels") + + # Build kernels + if not args.skip_build: + print( + f"\nBuilding {len(kernels)} kernels with {args.num_build_jobs} parallel jobs..." + ) + build_results = {"success": 0, "failed": 0, "failed_kernels": []} + + with ProcessPoolExecutor(max_workers=args.num_build_jobs) as executor: + futures = {executor.submit(build_kernel, build_dir, k): k for k in kernels} + + for i, future in enumerate(as_completed(futures)): + kernel_name, success, error = future.result() + if success: + build_results["success"] += 1 + else: + build_results["failed"] += 1 + build_results["failed_kernels"].append( + {"name": kernel_name, "error": error} + ) + + if (i + 1) % 10 == 0: + print( + f" Built {i + 1}/{len(kernels)} ({build_results['success']} success, {build_results['failed']} failed)" + ) + + print( + f"\nBuild complete: {build_results['success']} success, {build_results['failed']} failed" + ) + + # Save build results + with open(output_dir / "build_results.json", "w") as f: + json.dump(build_results, f, indent=2) + + # Get problem sizes + problem_sizes = get_problem_sizes() + print(f"\nBenchmarking {len(problem_sizes)} problem sizes...") + + # Run benchmarks + all_results = [] + total_benchmarks = len(kernels) * len(problem_sizes) + completed = 0 + + print(f"Total benchmarks to run: {total_benchmarks}") + + for kernel in kernels: + kernel_results = { + "kernel_config": asdict(kernel), + "benchmarks": [], + } + + for problem in problem_sizes: + result = run_benchmark( + build_dir, + kernel, + problem, + warmup=args.warmup, + repeat=args.repeat, + ) + kernel_results["benchmarks"].append(asdict(result)) + completed += 1 + + if completed % 100 == 0: + print(f" Progress: {completed}/{total_benchmarks} benchmarks complete") + + all_results.append(kernel_results) + + # Save intermediate results + intermediate_file = ( + output_dir / f"benchmark_results_{args.dtype}_{args.layout}_partial.json" + ) + with open(intermediate_file, "w") as f: + json.dump(all_results, f, indent=2) + + # Save final results + final_file = output_dir / f"benchmark_results_{args.dtype}_{args.layout}.json" + with open(final_file, "w") as f: + json.dump( + { + "metadata": { + "dtype": args.dtype, + "layout": args.layout, + "num_kernels": len(kernels), + "num_problems": len(problem_sizes), + "warmup": args.warmup, + "repeat": args.repeat, + "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), + }, + "problem_sizes": [asdict(p) for p in problem_sizes], + "results": all_results, + }, + f, + indent=2, + ) + + print(f"\nResults saved to {final_file}") + + # Print summary + valid_count = sum( + 1 for kr in all_results for br in kr["benchmarks"] if br["is_valid"] + ) + print(f"Valid 
benchmarks: {valid_count}/{total_benchmarks}") + + +if __name__ == "__main__": + main() diff --git a/dispatcher/heuristics/generate_edge_dims.py b/dispatcher/heuristics/generate_edge_dims.py new file mode 100644 index 0000000000..f5d243a5a9 --- /dev/null +++ b/dispatcher/heuristics/generate_edge_dims.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Supplementary edge-case benchmark generator for N=1 and K=1 dimensions. + +These shapes represent vector-matrix multiply (N=1), rank-1 updates (K=1), +and other degenerate GEMM cases that stress tile efficiency and padding logic. +""" + +import json +import subprocess +import sys +from pathlib import Path + + +def generate_edge_shapes(): + """Generate shapes with N=1, K=1, and other single-dimension edge cases.""" + shapes = set() + + # --- N=1: vector-matrix multiply / single output column --- + for m in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]: + for k in [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 7168, 8192]: + shapes.add((m, 1, k)) + + # --- K=1: rank-1 update / outer product --- + for m in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]: + for n in [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 7168, 8192]: + shapes.add((m, n, 1)) + + # --- M=1, N=1: dot product --- + for k in [1, 16, 64, 256, 1024, 4096, 8192]: + shapes.add((1, 1, k)) + + # --- M=1, K=1: scalar-vector --- + for n in [1, 16, 64, 256, 1024, 4096, 8192]: + shapes.add((1, n, 1)) + + # --- N=1, K=1: scalar-vector --- + for m in [1, 16, 64, 256, 1024, 4096, 8192]: + shapes.add((m, 1, 1)) + + # --- All ones: 1x1x1 --- + shapes.add((1, 1, 1)) + + # --- Small N (2-16) --- + for m in [64, 256, 1024, 4096]: + for n in [2, 3, 4, 7, 8, 15, 16]: + for k in [64, 256, 1024, 4096]: + shapes.add((m, n, k)) + + # --- Small K (2-16) --- + for m in [64, 256, 1024, 4096]: + for n in [64, 256, 1024, 4096]: + for k in [2, 3, 4, 7, 8, 15, 16]: + shapes.add((m, n, k)) + + return sorted(shapes) + + +def run_shapes(bin_dir, shapes, out_file, warmup=3, repeat=10): + """Run all kernels against shapes, writing streaming log.""" + executables = sorted(Path(bin_dir).glob("benchmark_gemm_universal_fp8_rcr_*")) + if not executables: + print(f"ERROR: No executables found in {bin_dir}", file=sys.stderr) + return 0 + + total = 0 + for idx, (m, n, k) in enumerate(shapes): + out_file.write("\n========================================\n") + out_file.write(f"Shape {idx + 1}: M={m} N={n} K={k} dtype=fp8 layout=rcr\n") + out_file.write("========================================\n") + out_file.write(f"Found {len(executables)} kernels\n") + out_file.flush() + + for exe in executables: + try: + result = subprocess.run( + [ + str(exe), + f"-m={m}", + f"-n={n}", + f"-k={k}", + f"-warmup={warmup}", + f"-repeat={repeat}", + "-verify=0", + ], + capture_output=True, + text=True, + timeout=60, + ) + output = result.stdout + json_start = output.find("{") + json_end = output.rfind("}") + 1 + if json_start >= 0 and json_end > json_start: + json_block = output[json_start:json_end] + try: + json.loads(json_block) + out_file.write(json_block + "\n") + total += 1 + except json.JSONDecodeError: + pass + except (subprocess.TimeoutExpired, Exception): + pass + + out_file.flush() + print( + f" Shape {idx + 1}/{len(shapes)}: M={m} N={n} K={k}", + file=sys.stderr, + flush=True, + ) + + return total + + +if __name__ == "__main__": + bin_dir = "/workspace/ck_tile/bin" + out_dir = 
Path("data/edge_dims") + out_dir.mkdir(parents=True, exist_ok=True) + + shapes = generate_edge_shapes() + print(f"Generated {len(shapes)} edge-case shapes", file=sys.stderr, flush=True) + + n1_count = sum(1 for m, n, k in shapes if n == 1) + k1_count = sum(1 for m, n, k in shapes if k == 1) + both1 = sum(1 for m, n, k in shapes if n == 1 and k == 1) + small_n = sum(1 for m, n, k in shapes if 2 <= n <= 16) + small_k = sum(1 for m, n, k in shapes if 2 <= k <= 16) + print( + f" N=1: {n1_count}, K=1: {k1_count}, both=1: {both1}", + file=sys.stderr, + flush=True, + ) + print( + f" Small N(2-16): {small_n}, Small K(2-16): {small_k}", + file=sys.stderr, + flush=True, + ) + + batch_size = 25 + total = 0 + batch_idx = 0 + for i in range(0, len(shapes), batch_size): + batch = shapes[i : i + batch_size] + batch_idx += 1 + out_path = out_dir / f"edge_dims_batch_{batch_idx:03d}.log" + print( + f"\nBatch {batch_idx}: shapes {i + 1}-{i + len(batch)} -> {out_path}", + file=sys.stderr, + flush=True, + ) + + with open(out_path, "w") as f: + f.write(f"CK Tile Edge Dims Benchmark Batch {batch_idx}\n") + f.write("GPU ID: 0\nImplementation: gemm_universal\n\n") + count = run_shapes(bin_dir, batch, f, warmup=3, repeat=10) + total += count + + print(f" Batch {batch_idx} done: {count} results", file=sys.stderr, flush=True) + + print( + f"\nTotal: {total} benchmarks across {len(shapes)} shapes", + file=sys.stderr, + flush=True, + ) diff --git a/dispatcher/heuristics/generate_wide_coverage.py b/dispatcher/heuristics/generate_wide_coverage.py new file mode 100644 index 0000000000..e8e8116946 --- /dev/null +++ b/dispatcher/heuristics/generate_wide_coverage.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Wide-coverage benchmark data generator. + +Generates benchmark results for hundreds of diverse GEMM shapes across all +corner cases: skinny M, tall N, deep K, M=1, prime dimensions, power-of-2, +LLM inference shapes, training shapes, and edge cases. + +Runs all 4608 kernels in /workspace/ck_tile/bin/ against each shape and +writes streaming log output parseable by data_pipeline.py. + +Usage: + python3 generate_wide_coverage.py --bin_dir /workspace/ck_tile/bin \ + --out_dir data/ --batch_size 20 --warmup 3 --repeat 10 +""" + +import argparse +import json +import subprocess +import sys +from pathlib import Path + + +def generate_shape_list(): + """Generate a comprehensive list of (M, N, K) shapes covering all corner cases. + + Categories: + 1. M=1 (single token inference) -- the hardest case + 2. Tiny M (2-16) -- small batch inference + 3. Small M (32-128) -- medium batch + 4. Medium M (256-2048) -- large batch / training + 5. Large M (4096-20480) -- very large batch + 6. Square shapes (powers of 2) + 7. Skinny M, tall N (M << N) + 8. Tall M, skinny N (M >> N) + 9. Deep K (K >> M, N) -- compute-bound + 10. Shallow K (K << M, N) -- memory-bound + 11. Prime dimensions -- worst-case for tiling + 12. LLM-specific shapes (DeepSeek, LLaMA, etc.) + 13. Non-power-of-2 common sizes + """ + shapes = set() + + # --- 1. M=1 (single token) across various N, K --- + for n in [512, 1024, 1536, 2048, 3072, 4096, 4608, 7168, 8192, 11008, 14336, 28672]: + for k in [256, 512, 1024, 1536, 2048, 2304, 4096, 7168, 8192]: + shapes.add((1, n, k)) + + # --- 2. Tiny M (2-16) --- + for m in [2, 4, 8, 16]: + for n in [512, 1536, 4096, 7168]: + for k in [256, 1024, 4096, 7168]: + shapes.add((m, n, k)) + + # --- 3. 
Small M (32-128) --- + for m in [32, 48, 64, 96, 128]: + for n in [512, 1536, 4096, 7168, 8192]: + for k in [256, 512, 2048, 4096, 7168]: + shapes.add((m, n, k)) + + # --- 4. Medium M (256-2048) --- + for m in [256, 384, 512, 768, 1024, 1536, 2048]: + for n in [512, 1536, 4096, 7168]: + for k in [256, 1024, 2048, 4096, 7168]: + shapes.add((m, n, k)) + + # --- 5. Large M (4096-20480) --- + for m in [4096, 8192, 12288, 16384, 20480]: + for n in [512, 1536, 4096, 7168]: + for k in [256, 1024, 2048, 7168]: + shapes.add((m, n, k)) + + # --- 6. Square shapes (powers of 2) --- + for p in range(5, 14): # 32 to 8192 + d = 2**p + shapes.add((d, d, d)) + + # --- 7. Skinny M, tall N --- + for m in [1, 4, 16, 64]: + for n in [8192, 16384, 28672]: + for k in [1024, 4096, 8192]: + shapes.add((m, n, k)) + + # --- 8. Tall M, skinny N --- + for m in [4096, 8192, 16384]: + for n in [32, 64, 128, 256]: + for k in [1024, 4096]: + shapes.add((m, n, k)) + + # --- 9. Deep K (K >> M, N) --- + for m in [16, 64, 256]: + for n in [16, 64, 256]: + for k in [4096, 8192, 16384, 32768]: + shapes.add((m, n, k)) + + # --- 10. Shallow K (K << M, N) --- + for m in [1024, 4096, 8192]: + for n in [1024, 4096, 8192]: + for k in [16, 32, 64, 128]: + shapes.add((m, n, k)) + + # --- 11. Prime dimensions --- + primes = [17, 31, 37, 127, 251, 509, 1021, 2039, 4093] + for p in primes: + shapes.add((p, p, p)) + for p in primes[:5]: + shapes.add((p, 4096, 4096)) + shapes.add((4096, p, 4096)) + shapes.add((4096, 4096, p)) + + # --- 12. LLM-specific shapes --- + llm_shapes = [ + # DeepSeek MoE + (1, 1536, 7168), + (1, 4608, 7168), + (1, 7168, 2048), + (1, 7168, 2304), + (1, 7168, 256), + (1, 576, 7168), + (1, 512, 7168), + (1, 3072, 1536), + # LLaMA-7B + (1, 4096, 4096), + (32, 4096, 4096), + (128, 4096, 4096), + (1, 4096, 11008), + (32, 4096, 11008), + (1, 11008, 4096), + (32, 11008, 4096), + # LLaMA-70B + (1, 8192, 8192), + (32, 8192, 8192), + (128, 8192, 8192), + (1, 8192, 28672), + (32, 8192, 28672), + (1, 28672, 8192), + # GPT-style attention + (128, 128, 64), + (128, 128, 128), + (256, 256, 64), + (512, 512, 64), + (1024, 1024, 64), + (2048, 2048, 64), + ] + for s in llm_shapes: + shapes.add(s) + + # --- 13. 
Non-power-of-2 common sizes --- + for m in [48, 96, 192, 384, 576, 768, 1152, 1536, 2304, 3072, 4608, 6144]: + shapes.add((m, m, m)) + shapes.add((m, 4096, 4096)) + + sorted_shapes = sorted(shapes) + return sorted_shapes + + +def run_shape_batch(bin_dir, shapes, out_file, warmup=3, repeat=10): + """Run all kernels against a batch of shapes, writing streaming log output.""" + executables = sorted(Path(bin_dir).glob("benchmark_gemm_universal_fp8_rcr_*")) + if not executables: + print(f"ERROR: No executables found in {bin_dir}", file=sys.stderr) + return 0 + + total_benchmarks = 0 + + for shape_idx, (m, n, k) in enumerate(shapes): + out_file.write("\n========================================\n") + out_file.write( + f"Shape {shape_idx + 1}: M={m} N={n} K={k} dtype=fp8 layout=rcr\n" + ) + out_file.write("========================================\n") + out_file.write(f"Found {len(executables)} kernels\n") + out_file.flush() + + for exe in executables: + try: + result = subprocess.run( + [ + str(exe), + f"-m={m}", + f"-n={n}", + f"-k={k}", + f"-warmup={warmup}", + f"-repeat={repeat}", + "-verify=0", + ], + capture_output=True, + text=True, + timeout=60, + ) + output = result.stdout + # Extract JSON block from output + json_start = output.find("{") + json_end = output.rfind("}") + 1 + if json_start >= 0 and json_end > json_start: + json_block = output[json_start:json_end] + try: + json.loads(json_block) + out_file.write(json_block + "\n") + total_benchmarks += 1 + except json.JSONDecodeError: + pass + except (subprocess.TimeoutExpired, Exception): + pass + + out_file.flush() + elapsed_kernels = len(executables) + print( + f" Shape {shape_idx + 1}/{len(shapes)}: M={m} N={n} K={k} " + f"({elapsed_kernels} kernels)", + file=sys.stderr, + flush=True, + ) + + return total_benchmarks + + +def main(): + parser = argparse.ArgumentParser( + description="Generate wide-coverage benchmark data" + ) + parser.add_argument( + "--bin_dir", + default="/workspace/ck_tile/bin", + help="Directory with benchmark executables", + ) + parser.add_argument("--out_dir", default="data", help="Output directory") + parser.add_argument( + "--batch_size", type=int, default=25, help="Shapes per output file" + ) + parser.add_argument("--warmup", type=int, default=3) + parser.add_argument("--repeat", type=int, default=10) + parser.add_argument( + "--max_shapes", type=int, default=None, help="Limit total shapes (for testing)" + ) + args = parser.parse_args() + + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + shapes = generate_shape_list() + if args.max_shapes: + shapes = shapes[: args.max_shapes] + + print(f"Generated {len(shapes)} unique shapes", file=sys.stderr, flush=True) + print(f"Bin dir: {args.bin_dir}", file=sys.stderr, flush=True) + print(f"Output dir: {args.out_dir}", file=sys.stderr, flush=True) + print(f"Batch size: {args.batch_size}", file=sys.stderr, flush=True) + + total = 0 + batch_idx = 0 + for i in range(0, len(shapes), args.batch_size): + batch = shapes[i : i + args.batch_size] + batch_idx += 1 + out_path = out_dir / f"wide_coverage_batch_{batch_idx:03d}.log" + + print( + f"\nBatch {batch_idx}: shapes {i + 1}-{i + len(batch)} -> {out_path}", + file=sys.stderr, + flush=True, + ) + + with open(out_path, "w") as f: + f.write(f"CK Tile Wide Coverage Benchmark Batch {batch_idx}\n") + f.write("GPU ID: 0\n") + f.write("Implementation: gemm_universal\n\n") + count = run_shape_batch( + args.bin_dir, batch, f, warmup=args.warmup, repeat=args.repeat + ) + total += count + + print( + f" Batch 
{batch_idx} complete: {count} benchmarks", + file=sys.stderr, + flush=True, + ) + + print( + f"\nTotal: {total} benchmarks across {len(shapes)} shapes", + file=sys.stderr, + flush=True, + ) + + +if __name__ == "__main__": + main() diff --git a/dispatcher/heuristics/ml_heuristic_sweep.py b/dispatcher/heuristics/ml_heuristic_sweep.py new file mode 100644 index 0000000000..7190a19678 --- /dev/null +++ b/dispatcher/heuristics/ml_heuristic_sweep.py @@ -0,0 +1,867 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +ML Heuristic Sweep: Comprehensive GEMM Performance Evaluation + +Sweeps across diverse problem shapes with ML-based kernel selection to measure +TFLOPS performance. Supports multiple dtypes (fp16, bf16, fp8) and validates +ML model predictions by executing kernels on GPU. + +Shape Constraints (fp16/bf16 on gfx950): +- M >= 1 (any M is valid) +- N % 8 == 0 AND N >= 64 +- K % 2 == 0 AND K >= 32 + +Usage: + python ml_heuristic_sweep.py --dtypes fp16 --num_shapes 256 + python ml_heuristic_sweep.py --dtypes fp16 bf16 --output sweep_results.csv + python ml_heuristic_sweep.py --dtypes fp16 --dry_run # Prediction only, no GPU execution +""" + +import sys +import argparse +import time +import csv +from pathlib import Path +from dataclasses import dataclass +from typing import List, Tuple + +# Add parent directories to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent / "python")) + +import numpy as np + +from ctypes_utils import ( + KernelConfig, + setup_gemm_dispatcher, + cleanup_gemm, +) + +try: + from predict import Predictor + # from feature_engine import GemmUniversalFeatureEngine + + HAS_ML = True +except ImportError: + HAS_ML = False + print("WARNING: ML heuristic modules not available.
Will use first-fit selection.") + + +@dataclass +class KernelSpec: + """Kernel specification for ML heuristic""" + + name: str + tile_m: int + tile_n: int + tile_k: int + pipeline: str = "compv3" + scheduler: str = "intrawave" + wave_m: int = 2 + wave_n: int = 2 + wave_k: int = 1 + warp_m: int = 32 + warp_n: int = 32 + warp_k: int = 16 + + +# Comprehensive kernel pool covering diverse tile sizes and configurations +KERNEL_POOL = [ + # Small tiles (64x64) + KernelSpec( + "s_64x64_k32_v3", 64, 64, 32, "compv3", "intrawave", 2, 2, 1, 16, 16, 16 + ), + KernelSpec( + "s_64x64_k64_v3", 64, 64, 64, "compv3", "intrawave", 2, 2, 1, 16, 16, 16 + ), + KernelSpec( + "s_64x64_k128_v3", 64, 64, 128, "compv3", "intrawave", 2, 2, 1, 16, 16, 16 + ), + KernelSpec( + "s_64x64_k64_v4", 64, 64, 64, "compv4", "intrawave", 2, 2, 1, 16, 16, 16 + ), + KernelSpec("s_64x64_k64_mem", 64, 64, 64, "mem", "intrawave", 2, 2, 1, 16, 16, 16), + KernelSpec( + "s_64x64_k128_mem", 64, 64, 128, "mem", "intrawave", 2, 2, 1, 16, 16, 16 + ), + # Medium tiles (128x128) + KernelSpec("m_128x128_k32_v3", 128, 128, 32, "compv3", "intrawave"), + KernelSpec("m_128x128_k64_v3", 128, 128, 64, "compv3", "intrawave"), + KernelSpec("m_128x128_k128_v3", 128, 128, 128, "compv3", "intrawave"), + KernelSpec("m_128x128_k64_v4", 128, 128, 64, "compv4", "intrawave"), + KernelSpec("m_128x128_k128_v4", 128, 128, 128, "compv4", "intrawave"), + KernelSpec("m_128x128_k64_mem", 128, 128, 64, "mem", "intrawave"), + KernelSpec("m_128x128_k128_mem", 128, 128, 128, "mem", "intrawave"), + # Rectangular medium (M != N) + KernelSpec( + "r_64x128_k32_v3", 64, 128, 32, "compv3", "intrawave", 2, 2, 1, 16, 32, 16 + ), + KernelSpec( + "r_128x64_k32_v3", 128, 64, 32, "compv3", "intrawave", 2, 2, 1, 32, 16, 16 + ), + KernelSpec( + "r_64x128_k64_v3", 64, 128, 64, "compv3", "intrawave", 2, 2, 1, 16, 32, 16 + ), + KernelSpec( + "r_128x64_k64_v3", 128, 64, 64, "compv3", "intrawave", 2, 2, 1, 32, 16, 16 + ), + KernelSpec( + "r_64x256_k32_v3", 64, 256, 32, "compv3", "intrawave", 2, 2, 1, 16, 32, 16 + ), + KernelSpec( + "r_256x64_k32_v3", 256, 64, 32, "compv3", "intrawave", 2, 2, 1, 32, 16, 16 + ), + # Large tiles (256x256) + KernelSpec("l_256x128_k32_v3", 256, 128, 32, "compv3", "intrawave"), + KernelSpec("l_128x256_k32_v3", 128, 256, 32, "compv3", "intrawave"), + KernelSpec("l_256x256_k32_v3", 256, 256, 32, "compv3", "intrawave"), + KernelSpec("l_256x256_k64_v3", 256, 256, 64, "compv3", "intrawave"), + KernelSpec("l_256x256_k64_v4", 256, 256, 64, "compv4", "intrawave"), + # Interwave variants + KernelSpec("m_128x128_k64_iw_v3", 128, 128, 64, "compv3", "interwave"), + KernelSpec("m_128x128_k128_iw_v3", 128, 128, 128, "compv3", "interwave"), + KernelSpec("l_256x256_k32_iw_v3", 256, 256, 32, "compv3", "interwave"), +] + + +def generate_problem_shapes(num_shapes: int = 1024) -> List[Tuple[int, int, int]]: + """ + Generate diverse problem shapes with hardware constraints: + - M >= 1 (any M is valid, including tiny M for inference) + - N % 8 == 0 AND N >= 64 (hardware alignment requirement) + - K % 2 == 0 AND K >= 32 (fp16 requirement) + + Covers: + - Powers of 2 (square and rectangular) + - ML workloads (LLM attention, MLP, batch inference) + - Non-power-of-2 dimensions (aligned to constraints) + - Edge cases (tiny M, very large matrices, extreme aspect ratios) + """ + shapes = [] + + # 1. 
Powers of 2 - Square (64 to 8192) with K variations + for p in range(6, 14): # 2^6=64 to 2^13=8192 + dim = 2**p + shapes.append((dim, dim, dim)) + if dim >= 128: + # K variations (must be even and >= 32) + shapes.append((dim, dim, dim // 2)) + shapes.append((dim, dim, dim * 2)) + shapes.append((dim, dim, max(32, dim // 4))) + + # 2. Small batch inference (1-256 batch, common hidden dims) + # N must be multiple of 8 and >= 64 + hidden_dims = [768, 1024, 2048, 3072, 4096, 5120, 8192, 11008, 12288, 16384] + batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256] + + for hidden in hidden_dims: + for batch in batch_sizes[:8]: + shapes.append((batch, hidden, hidden)) + if hidden >= 4096: + # LLM MLP projections (ensure K is even) + k_mlp = hidden * 3 // 4 + if k_mlp % 2 == 1: + k_mlp += 1 # Make even + if k_mlp >= 32: + shapes.append((batch, hidden, k_mlp)) + shapes.append((batch, k_mlp, hidden)) + + # 3. Attention patterns (seq_len x head_dim) + # seq_len can be any value >= 1, total_dim must be multiple of 8 + seq_lens = [128, 256, 512, 1024, 2048, 4096, 8192] + head_dims = [64, 80, 96, 128, 256] + num_heads = [8, 12, 16, 32, 40, 64] + + for seq in seq_lens: + for head_dim in head_dims: + for nh in num_heads[:4]: + total_dim = nh * head_dim + # total_dim should be multiple of 8 (naturally satisfied for most cases) + if total_dim % 8 == 0 and total_dim >= 64: + # head_dim must be even for K + if head_dim % 2 == 0 and head_dim >= 32: + shapes.append((seq, total_dim, head_dim)) + shapes.append((seq, head_dim, total_dim)) + + # 4. Rectangular matrices (extreme aspect ratios) + # All dims must satisfy constraints + dims_m = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096] + dims_n = [64, 128, 256, 512, 1024, 2048, 4096, 8192] # N >= 64, N % 8 == 0 + dims_k = [ + 32, + 64, + 128, + 256, + 512, + 1024, + 2048, + 4096, + 8192, + 16384, + ] # K >= 32, K % 2 == 0 + + # Sample to avoid explosion + for i, m in enumerate(dims_m): + for j, n in enumerate(dims_n): + for _l, k in enumerate(dims_k): + if (i + j + _l) % 3 == 0: # Stratified sampling + shapes.append((m, n, k)) + + # 5. 
Non-power-of-2 dimensions (aligned to constraints) + # N values: multiples of 8, >= 64 + non_pow2_n = [ + 72, + 80, + 88, + 96, + 104, + 112, + 120, + 136, + 144, + 152, + 160, + 176, + 184, + 192, + 200, + 224, + 240, + 272, + 288, + 304, + 320, + 336, + 352, + 368, + 384, + 400, + 416, + 448, + 480, + 544, + 576, + 640, + 672, + 704, + 736, + 768, + 800, + 832, + 896, + 960, + 1088, + 1152, + 1216, + 1280, + 1344, + 1408, + 1472, + 1536, + 1600, + 1664, + 1728, + 1792, + 1856, + 1920, + 2176, + 2304, + 2432, + 2560, + 2688, + 2816, + 2944, + 3072, + 3200, + 3328, + 3456, + 3584, + 3712, + 3840, + 3968, + 4224, + 4352, + 4480, + 4608, + 4736, + 4864, + 4992, + ] + + # K values: even numbers >= 32 + non_pow2_k = [ + 34, + 36, + 38, + 40, + 42, + 44, + 48, + 50, + 52, + 56, + 60, + 66, + 68, + 72, + 76, + 80, + 88, + 96, + 100, + 112, + 120, + 136, + 144, + 160, + 176, + 192, + 224, + 240, + 272, + 288, + 320, + 352, + 384, + 416, + 448, + 480, + 544, + 576, + 640, + 672, + 704, + 768, + 800, + 832, + 896, + 960, + 1088, + 1152, + 1280, + 1344, + 1408, + 1536, + 1600, + 1664, + 1792, + 1920, + ] + + # M values: any value >= 1 + non_pow2_m = [ + 1, + 3, + 5, + 7, + 9, + 11, + 13, + 15, + 17, + 19, + 23, + 27, + 31, + 33, + 37, + 41, + 47, + 51, + 57, + 63, + 65, + 71, + 79, + 87, + 95, + 97, + 111, + 119, + 127, + 129, + 143, + 159, + 175, + 191, + 193, + 223, + 239, + 255, + 257, + 287, + 319, + 351, + 383, + 385, + 447, + 479, + 511, + 513, + 575, + 639, + 703, + 767, + 769, + 895, + 959, + 1023, + 1025, + ] + + # Sample non-power-of-2 shapes + for i, m in enumerate(non_pow2_m[:30]): + for j, n in enumerate(non_pow2_n[:20]): + for _l, k in enumerate(non_pow2_k[:15]): + if (i + j + _l) % 4 == 0: # Stratified sampling + shapes.append((m, n, k)) + + # 6. Very tall K (memory-bound) - ensure N % 8 == 0, K % 2 == 0 + for mn in [64, 128, 256, 512, 1024]: + for k in [4096, 8192, 16384]: + shapes.append((mn, mn, k)) + + # 7. Very short K (compute-bound) - ensure K >= 32, K % 2 == 0 + for mn in [512, 1024, 2048, 4096]: + for k in [32, 64, 128]: + shapes.append((mn, mn, k)) + + # 8. Tiny M (edge cases for batch-1 inference) + for m in [1, 2, 4, 8, 16, 32]: + for n in [64, 128, 256, 512, 1024, 2048]: # N >= 64, N % 8 == 0 + for k in [32, 64, 128, 256, 512]: # K >= 32, K % 2 == 0 + shapes.append((m, n, k)) + + # 9. 
Stress test sizes (aligned to constraints) + stress_sizes = [ + (10000, 10000, 10000), + (1000, 10000, 1000), + (1000, 1000, 10000), + (5000, 5000, 5000), + (7168, 7168, 7168), # Common LLM hidden dim + (8192, 11008, 8192), # LLaMA MLP dimensions + ] + shapes.extend(stress_sizes) + + # Remove duplicates while preserving order + seen = set() + unique_shapes = [] + for s in shapes: + if s not in seen: + seen.add(s) + unique_shapes.append(s) + + # Filter to ensure all shapes meet constraints + valid_shapes = [] + for m, n, k in unique_shapes: + if m >= 1 and n >= 64 and n % 8 == 0 and k >= 32 and k % 2 == 0: + valid_shapes.append((m, n, k)) + + # Sample down to target number if we have too many + if len(valid_shapes) > num_shapes: + # Stratified sampling to preserve diversity + step = len(valid_shapes) / num_shapes + valid_shapes = [valid_shapes[int(i * step)] for i in range(num_shapes)] + + return valid_shapes + + +def spec_to_feature_dict(spec: KernelSpec, dtype: str, layout: str) -> dict: + """Convert KernelSpec to feature dict for ML predictor""" + return { + "kernel_name": spec.name, + "tile_m": spec.tile_m, + "tile_n": spec.tile_n, + "tile_k": spec.tile_k, + "warp_m": spec.wave_m, + "warp_n": spec.wave_n, + "warp_k": spec.wave_k, + "warp_tile_m": spec.warp_m, + "warp_tile_n": spec.warp_n, + "warp_tile_k": spec.warp_k, + "pipeline": spec.pipeline, + "scheduler": spec.scheduler, + "epilogue": "cshuffle", + "pad_m": True, # Enable padding to support arbitrary M dimensions + "pad_n": True, # Enable padding to support arbitrary N dimensions + "pad_k": True, # Enable padding to support arbitrary K dimensions + "persistent": False, + "dtype": dtype, + "layout": layout, + } + + +def spec_to_kernel_config( + spec: KernelSpec, dtype: str, arch: str, dtype_acc: str = "fp32" +) -> KernelConfig: + """Convert KernelSpec to KernelConfig for dispatcher""" + return KernelConfig( + dtype_a=dtype, + dtype_b=dtype, + dtype_c=dtype, + dtype_acc=dtype_acc, + layout_a="row", + layout_b="col", + layout_c="row", + tile_m=spec.tile_m, + tile_n=spec.tile_n, + tile_k=spec.tile_k, + wave_m=spec.wave_m, + wave_n=spec.wave_n, + wave_k=spec.wave_k, + warp_m=spec.warp_m, + warp_n=spec.warp_n, + warp_k=spec.warp_k, + pipeline=spec.pipeline, + scheduler=spec.scheduler, + epilogue="cshuffle", + gfx_arch=arch, + ) + + +def ml_select_kernel( + predictor, pool: List[KernelSpec], M: int, N: int, K: int, dtype: str, layout: str +) -> Tuple[KernelSpec, float]: + """Use ML model to select best kernel""" + if not HAS_ML or predictor is None: + # Fallback: select first kernel + return pool[0], 0.0 + + problem = {"m": M, "n": N, "k": K, "dtype": dtype, "layout": layout, "split_k": 1} + kernel_dicts = [spec_to_feature_dict(s, dtype, layout) for s in pool] + + ranked = predictor.rank_kernels(problem, kernel_dicts) + if not ranked: + return pool[0], 0.0 + + best_name, best_tflops = ranked[0] + best_spec = next((s for s in pool if s.name == best_name), pool[0]) + return best_spec, best_tflops + + +def run_single_gemm( + M: int, + N: int, + K: int, + dtype: str, + arch: str, + predictor, + dry_run: bool = False, + dtype_acc: str = "fp32", +) -> dict: + """Run a single GEMM with ML heuristic selection""" + + # Select kernel via ML heuristic + t0 = time.time() + best_spec, pred_tflops = ml_select_kernel( + predictor, KERNEL_POOL, M, N, K, dtype, "rcr" + ) + select_time_ms = (time.time() - t0) * 1000 + + result = { + "M": M, + "N": N, + "K": K, + "dtype": dtype, + "selected_kernel": best_spec.name, + "predicted_tflops": pred_tflops, + 
"selection_time_ms": select_time_ms, + "actual_time_ms": 0, + "actual_tflops": 0, + "status": "SKIP" if dry_run else "PENDING", + "error": None, + } + + if dry_run: + return result + + # Build and run kernel + config = spec_to_kernel_config(best_spec, dtype, arch, dtype_acc) + + try: + setup = setup_gemm_dispatcher( + config=config, + registry_name=f"sweep_{dtype}_{best_spec.name}", + verbose=False, + auto_rebuild=True, + ) + + if not setup.success: + result["status"] = "BUILD_FAIL" + result["error"] = "Failed to build kernel" + cleanup_gemm() + return result + + dispatcher = setup.dispatcher + if not dispatcher.is_supported(M, N, K): + result["status"] = "UNSUPPORTED" + result["error"] = "Problem size not supported by kernel" + cleanup_gemm() + return result + + # Create input data + np_dtype = {"fp16": np.float16, "bf16": np.float16, "fp8": np.float16}[dtype] + np.random.seed(42) + A = (np.random.randn(M, K) * 0.1).astype(np_dtype) + B = (np.random.randn(K, N) * 0.1).astype(np_dtype) + + # Run GEMM + exec_result = dispatcher.run(A, B, M, N, K) + + if exec_result.success: + result["actual_time_ms"] = exec_result.time_ms + result["actual_tflops"] = exec_result.tflops + result["status"] = "SUCCESS" + else: + # Decode status code for better error message + status_messages = { + 0: "Success", + -1: "GPU/HIP error (check permissions, memory, or kernel validity)", + -2: "No suitable kernel found for this problem size", + } + error_msg = status_messages.get(exec_result.status, f"Unknown error (status={exec_result.status})") + result["status"] = "RUN_FAIL" + result["error"] = f"{error_msg} (status_code={exec_result.status})" + + # Print detailed error for debugging + print(f" ERROR: {error_msg}") + print(f" Status code: {exec_result.status}") + print(f" Time returned: {exec_result.time_ms}") + print(f" Kernel: {exec_result.kernel_name}") + + cleanup_gemm() + + except Exception as e: + result["status"] = "ERROR" + result["error"] = str(e)[:200] + cleanup_gemm() + + return result + + +def main(): + parser = argparse.ArgumentParser( + description="ML Heuristic Sweep: Test GEMM across many shapes and dtypes" + ) + parser.add_argument( + "--dtypes", + nargs="+", + default=["fp16"], + choices=["fp16", "bf16", "fp8"], + help="Data types to test (default: fp16)", + ) + parser.add_argument( + "--arch", default="gfx950", help="GPU architecture (default: gfx950)" + ) + parser.add_argument( + "--dtype_acc", + default="fp32", + choices=["fp16", "fp32"], + help="Accumulator data type (default: fp32)", + ) + parser.add_argument( + "--model_dir", + default=None, + help="Path to model directory (auto-detect if not specified)", + ) + parser.add_argument( + "--num_shapes", + type=int, + default=256, + help="Number of problem shapes to test (default: 256)", + ) + parser.add_argument( + "--output", + default="ml_heuristic_sweep_results.csv", + help="Output CSV file path", + ) + parser.add_argument( + "--dry_run", + action="store_true", + help="Only predict, do not run kernels (fast validation)", + ) + + args = parser.parse_args() + + # Setup ML predictor + predictor = None + if HAS_ML: + if args.model_dir is None: + # Auto-detect model directory based on first dtype + first_dtype = args.dtypes[0] + heuristics_dir = Path(__file__).parent + model_candidates = [ + heuristics_dir / "models" / f"gemm_universal_{first_dtype}_{args.arch}", + ] + for model_dir in model_candidates: + if model_dir.exists(): + args.model_dir = str(model_dir) + break + + if args.model_dir and Path(args.model_dir).exists(): + try: + predictor = 
Predictor(args.model_dir) + print(f"✓ Loaded ML model from: {args.model_dir}") + except Exception as e: + print(f"⚠ Failed to load ML model: {e}") + print(" Will use first-fit selection instead") + else: + print(f"⚠ Model directory not found: {args.model_dir}") + print(" Will use first-fit selection instead") + + # Generate problem shapes + print(f"\nGenerating {args.num_shapes} problem shapes...") + shapes = generate_problem_shapes(args.num_shapes) + print( + f"✓ Generated {len(shapes)} valid shapes (M>=1, N%8==0, N>=64, K%2==0, K>=32)" + ) + + # Validate all shapes meet constraints + invalid = [ + (m, n, k) + for m, n, k in shapes + if not (m >= 1 and n >= 64 and n % 8 == 0 and k >= 32 and k % 2 == 0) + ] + if invalid: + print(f"⚠ WARNING: {len(invalid)} shapes violate constraints!") + print(f" First few: {invalid[:5]}") + + # Print configuration + print("\n" + "=" * 80) + print(" ML Heuristic Sweep Configuration") + print("=" * 80) + print( + f" Model: {args.model_dir if args.model_dir else 'first-fit (no ML)'}" + ) + print(f" Data types: {', '.join(args.dtypes)}") + print(f" Accumulator: {args.dtype_acc}") + print(f" Architecture: {args.arch}") + print(f" Kernel pool: {len(KERNEL_POOL)} kernels") + print(f" Problem shapes: {len(shapes)}") + print(f" Total tests: {len(shapes) * len(args.dtypes)}") + print( + f" Mode: {'DRY RUN (prediction only)' if args.dry_run else 'FULL RUN (execute kernels)'}" + ) + print(f" Output: {args.output}") + print("=" * 80) + + # Open output CSV + csv_file = open(args.output, "w", newline="") + csv_writer = csv.DictWriter( + csv_file, + fieldnames=[ + "dtype", + "M", + "N", + "K", + "selected_kernel", + "predicted_tflops", + "selection_time_ms", + "actual_time_ms", + "actual_tflops", + "status", + "error", + ], + ) + csv_writer.writeheader() + + # Run sweep + total_tests = len(shapes) * len(args.dtypes) + completed = 0 + start_time = time.time() + + print("\nStarting sweep... (Ctrl+C to stop and save partial results)\n") + + try: + for dtype in args.dtypes: + print(f"\n{'=' * 80}") + print(f" Testing dtype: {dtype.upper()}") + print(f"{'=' * 80}\n") + + for i, (M, N, K) in enumerate(shapes): + result = run_single_gemm( + M, N, K, dtype, args.arch, predictor, args.dry_run, args.dtype_acc + ) + + # Write to CSV + csv_writer.writerow(result) + csv_file.flush() + + completed += 1 + + # Progress update + if completed % 10 == 0 or result["status"] != "SUCCESS": + elapsed = time.time() - start_time + rate = completed / elapsed if elapsed > 0 else 0 + eta = (total_tests - completed) / rate if rate > 0 else 0 + + status_emoji = { + "SUCCESS": "✓", + "SKIP": "→", + "BUILD_FAIL": "✗", + "UNSUPPORTED": "○", + "RUN_FAIL": "✗", + "ERROR": "✗", + }.get(result["status"], "?") + + print( + f" [{completed:4d}/{total_tests}] {status_emoji} " + f"{dtype:4s} {M:5d}x{N:5d}x{K:5d} → " + f"{result['selected_kernel']:20s} " + f"pred={result['predicted_tflops']:6.1f} " + f"actual={result['actual_tflops']:6.1f} TFLOPS " + f"[{rate:.1f} tests/s, ETA {eta / 60:.1f}m]" + ) + + except KeyboardInterrupt: + print(f"\n\n⚠ Interrupted! 
Saving partial results to {args.output}...") + + finally: + csv_file.close() + + # Summary + print("\n" + "=" * 80) + print(" SWEEP COMPLETE") + print("=" * 80) + + # Read back results and compute statistics + results = [] + with open(args.output, "r") as f: + reader = csv.DictReader(f) + results = list(reader) + + print(f"\n Total tests: {len(results)}") + print(f" Output file: {args.output}") + + if not args.dry_run: + success = [r for r in results if r["status"] == "SUCCESS"] + print( + f" Successful: {len(success)} ({100 * len(success) / len(results):.1f}%)" + ) + + if success: + avg_tflops = np.mean([float(r["actual_tflops"]) for r in success]) + max_tflops = max([float(r["actual_tflops"]) for r in success]) + print(f" Avg TFLOPS: {avg_tflops:.2f}") + print(f" Max TFLOPS: {max_tflops:.2f}") + + # Per-dtype breakdown + for dtype in args.dtypes: + dtype_results = [r for r in success if r["dtype"] == dtype] + if dtype_results: + avg = np.mean([float(r["actual_tflops"]) for r in dtype_results]) + print( + f" {dtype:4s}: {avg:.2f} TFLOPS (n={len(dtype_results)})" + ) + + print("=" * 80) + print() + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/feature_spec.json b/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/feature_spec.json new file mode 100644 index 0000000000..dc4ed02e5e --- /dev/null +++ b/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/feature_spec.json @@ -0,0 +1,113 @@ +{ + "op_type": "gemm_universal", + "dtype": "fp16", + "arch": "gfx950", + "feature_names": [ + "M", + "N", + "K", + "split_k", + "log2_M", + "log2_N", + "log2_K", + "log2_MNK", + "arithmetic_intensity", + "aspect_ratio_mn", + "aspect_ratio_mk", + "aspect_ratio_nk", + "layout", + "tile_m", + "tile_n", + "tile_k", + "warp_m", + "warp_n", + "warp_k", + "warp_tile_m", + "warp_tile_n", + "warp_tile_k", + "pipeline", + "scheduler", + "epilogue", + "pad_m", + "pad_n", + "pad_k", + "persistent", + "num_warps", + "tile_volume", + "tile_mn", + "lds_usage_estimate", + "lds_usage_ratio", + "num_tiles_m", + "num_tiles_n", + "num_tiles_k", + "total_output_tiles", + "tile_eff_m", + "tile_eff_n", + "tile_eff_k", + "overall_tile_efficiency", + "cu_utilization", + "ratio_M_to_tile_m", + "ratio_N_to_tile_n", + "ratio_K_to_tile_k", + "problem_smaller_than_tile_m", + "problem_smaller_than_tile_n", + "problem_smaller_than_tile_k", + "any_dim_too_small", + "needs_padding_m", + "needs_padding_n", + "needs_padding_k", + "has_padding_when_needed_m", + "has_padding_when_needed_n", + "has_padding_when_needed_k", + "missing_required_padding_m", + "missing_required_padding_n", + "missing_required_padding_k", + "missing_any_required_padding", + "hw_num_cus", + "hw_simds_per_cu", + "hw_total_simds", + "hw_shader_engines", + "hw_max_clock_mhz", + "hw_max_waves_per_cu", + "hw_wavefront_size", + "hw_lds_capacity", + "hw_l1_cache_kb", + "hw_l2_cache_kb", + "hw_l3_cache_kb", + "hw_num_xcd" + ], + "categorical_features": [ + "layout", + "pipeline", + "scheduler", + "epilogue" + ], + "targets": [ + "tflops", + "latency", + "bandwidth" + ], + "log_targets": [ + "bandwidth", + "tflops" + ], + "params": { + "objective": "regression", + "metric": [ + "rmse", + "mae" + ], + "num_leaves": 255, + "max_depth": 15, + "n_estimators": 2000, + "learning_rate": 0.02, + "min_child_samples": 10, + "subsample": 0.85, + "colsample_bytree": 0.85, + "reg_alpha": 0.05, + "reg_lambda": 0.5, + "verbose": -1, + "n_jobs": 8, + "seed": 42 + } +} \ No newline at end of file diff 
--git a/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/model_tflops.lgbm.gz b/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/model_tflops.lgbm.gz new file mode 100644 index 0000000000..a59cc73c4f Binary files /dev/null and b/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/model_tflops.lgbm.gz differ diff --git a/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/train_manifest.json b/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/train_manifest.json new file mode 100644 index 0000000000..7028dc32fa --- /dev/null +++ b/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/train_manifest.json @@ -0,0 +1,10 @@ +{ + "warm_start_from": null, + "prev_n_estimators": 0, + "new_n_estimators": 2000, + "total_n_estimators": 2000, + "data_rows": 25600, + "valid_rows": 21920, + "unique_shapes": 25, + "timestamp": "2026-03-20T05:00:55" +} \ No newline at end of file diff --git a/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/feature_spec.json b/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/feature_spec.json new file mode 100644 index 0000000000..ffc4052d9b --- /dev/null +++ b/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/feature_spec.json @@ -0,0 +1,113 @@ +{ + "op_type": "gemm_universal", + "dtype": "fp8", + "arch": "gfx950", + "feature_names": [ + "M", + "N", + "K", + "split_k", + "log2_M", + "log2_N", + "log2_K", + "log2_MNK", + "arithmetic_intensity", + "aspect_ratio_mn", + "aspect_ratio_mk", + "aspect_ratio_nk", + "layout", + "tile_m", + "tile_n", + "tile_k", + "warp_m", + "warp_n", + "warp_k", + "warp_tile_m", + "warp_tile_n", + "warp_tile_k", + "pipeline", + "scheduler", + "epilogue", + "pad_m", + "pad_n", + "pad_k", + "persistent", + "num_warps", + "tile_volume", + "tile_mn", + "lds_usage_estimate", + "lds_usage_ratio", + "num_tiles_m", + "num_tiles_n", + "num_tiles_k", + "total_output_tiles", + "tile_eff_m", + "tile_eff_n", + "tile_eff_k", + "overall_tile_efficiency", + "cu_utilization", + "ratio_M_to_tile_m", + "ratio_N_to_tile_n", + "ratio_K_to_tile_k", + "problem_smaller_than_tile_m", + "problem_smaller_than_tile_n", + "problem_smaller_than_tile_k", + "any_dim_too_small", + "needs_padding_m", + "needs_padding_n", + "needs_padding_k", + "has_padding_when_needed_m", + "has_padding_when_needed_n", + "has_padding_when_needed_k", + "missing_required_padding_m", + "missing_required_padding_n", + "missing_required_padding_k", + "missing_any_required_padding", + "hw_num_cus", + "hw_simds_per_cu", + "hw_total_simds", + "hw_shader_engines", + "hw_max_clock_mhz", + "hw_max_waves_per_cu", + "hw_wavefront_size", + "hw_lds_capacity", + "hw_l1_cache_kb", + "hw_l2_cache_kb", + "hw_l3_cache_kb", + "hw_num_xcd" + ], + "categorical_features": [ + "layout", + "pipeline", + "scheduler", + "epilogue" + ], + "targets": [ + "tflops", + "latency", + "bandwidth" + ], + "log_targets": [ + "bandwidth", + "tflops" + ], + "params": { + "objective": "regression", + "metric": [ + "rmse", + "mae" + ], + "num_leaves": 255, + "max_depth": 15, + "n_estimators": 2000, + "learning_rate": 0.02, + "min_child_samples": 10, + "subsample": 0.85, + "colsample_bytree": 0.85, + "reg_alpha": 0.05, + "reg_lambda": 0.5, + "verbose": -1, + "n_jobs": 8, + "seed": 42 + } +} \ No newline at end of file diff --git a/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/model_tflops.lgbm.gz b/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/model_tflops.lgbm.gz new file mode 100644 index 0000000000..a2a08ee01a Binary files /dev/null and 
b/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/model_tflops.lgbm.gz differ diff --git a/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/train_manifest.json b/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/train_manifest.json new file mode 100644 index 0000000000..d7ce61d2ff --- /dev/null +++ b/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/train_manifest.json @@ -0,0 +1,10 @@ +{ + "warm_start_from": null, + "prev_n_estimators": 0, + "new_n_estimators": 2000, + "total_n_estimators": 2000, + "data_rows": 1296528, + "valid_rows": 1253076, + "unique_shapes": 168, + "timestamp": "2026-03-19T06:10:29" +} \ No newline at end of file diff --git a/dispatcher/heuristics/predict.py b/dispatcher/heuristics/predict.py new file mode 100644 index 0000000000..8738c76f23 --- /dev/null +++ b/dispatcher/heuristics/predict.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Predictor for CK Tile kernel performance. + +Loads trained LightGBM models and provides: + - predict_tflops(): predicted TFLOPS for a single (problem, kernel) pair + - predict_latency(): predicted latency in ms + - predict_bandwidth(): predicted bandwidth in GB/s + - predict_all(): all three predictions at once + - rank_kernels(): rank all candidate kernels by predicted TFLOPS + - select_best(): return the best kernel ID + +Usage: + predictor = Predictor("models/gemm_universal_fp8_gfx950") + best_kernel = predictor.select_best( + problem={"m": 128, "n": 1536, "k": 7168, "dtype": "fp8", "layout": "rcr"}, + kernel_configs=[...], + ) +""" + +import gzip +import json +from pathlib import Path +from typing import Optional + +import lightgbm as lgb +import numpy as np +import pandas as pd + +from feature_engine import GemmUniversalFeatureEngine + + +class Predictor: + """Loads trained models and feature spec for kernel performance prediction. + + Parameters + ---------- + model_dir : str or Path + Directory containing model artifacts: + - model_tflops.lgbm (required) + - model_latency.lgbm (optional) + - model_bandwidth.lgbm (optional) + - feature_spec.json (required) + + feature_engine : FeatureEngine, optional + Override the feature engine. If None, constructs one from feature_spec.json. + """ + + def __init__(self, model_dir: str | Path, feature_engine=None): + self._model_dir = Path(model_dir) + self._models: dict[str, lgb.Booster] = {} + + spec_path = self._model_dir / "feature_spec.json" + if spec_path.exists(): + with open(spec_path) as f: + self._spec = json.load(f) + else: + self._spec = {} + + self._log_targets = set(self._spec.get("log_targets", [])) + + if feature_engine is not None: + self._feature_engine = feature_engine + else: + self._feature_engine = GemmUniversalFeatureEngine() + + def _load_model(self, target: str) -> Optional[lgb.Booster]: + """Lazy-load a model for the given target. + + Automatically decompresses .lgbm.gz files if the .lgbm file doesn't exist. + The decompressed file is cached to disk for subsequent loads. 
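+
+ Returns None when neither model_<target>.lgbm nor model_<target>.lgbm.gz is
+ present, so optional targets (latency, bandwidth) may simply be absent.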
+ """ + if target in self._models: + return self._models[target] + + path = self._model_dir / f"model_{target}.lgbm" + gz_path = self._model_dir / f"model_{target}.lgbm.gz" + + # Auto-decompress if needed + if not path.exists() and gz_path.exists(): + with gzip.open(gz_path, 'rb') as f_in: + with open(path, 'wb') as f_out: + f_out.write(f_in.read()) + + if not path.exists(): + return None + + model = lgb.Booster(model_file=str(path)) + self._models[target] = model + return model + + def _predict_single(self, target: str, problem: dict, kernel_config: dict) -> float: + """Predict a single target value, applying inverse log transform if needed.""" + model = self._load_model(target) + if model is None: + raise FileNotFoundError(f"No model_{target}.lgbm in {self._model_dir}") + features = self._feature_engine.extract(problem, kernel_config) + raw = float(model.predict(features.reshape(1, -1))[0]) + if target in self._log_targets: + return float(np.expm1(raw)) + # Clamp to non-negative even for non-log models + return float(max(0.0, raw)) + + def predict_tflops(self, problem: dict, kernel_config: dict) -> float: + """Predict TFLOPS for a single (problem, kernel) pair. + + Returns a real TFLOPS estimate (interpretable, usable as DE surrogate). + If the model was trained in log-space, the inverse transform is applied + automatically. + """ + return self._predict_single("tflops", problem, kernel_config) + + def predict_latency(self, problem: dict, kernel_config: dict) -> float: + """Predict latency in milliseconds for a single (problem, kernel) pair.""" + return self._predict_single("latency", problem, kernel_config) + + def predict_bandwidth(self, problem: dict, kernel_config: dict) -> float: + """Predict bandwidth in GB/s for a single (problem, kernel) pair.""" + return self._predict_single("bandwidth", problem, kernel_config) + + def predict_all(self, problem: dict, kernel_config: dict) -> dict[str, float]: + """Predict all available targets for a single (problem, kernel) pair. + + Returns dict with keys 'tflops', 'latency_ms', 'bandwidth_gb_s' (if models exist). + + Note: Applies inverse log transform for targets in log_targets and clamps + negatives to 0.0, consistent with _predict_single(). + """ + features = self._feature_engine.extract(problem, kernel_config).reshape(1, -1) + result = {} + for target, key in [ + ("tflops", "tflops"), + ("latency", "latency_ms"), + ("bandwidth", "bandwidth_gb_s"), + ]: + model = self._load_model(target) + if model is not None: + raw = float(model.predict(features)[0]) + # Apply inverse log transform if model was trained in log-space + if target in self._log_targets: + result[key] = float(np.expm1(raw)) + else: + # Clamp to non-negative even for non-log models + result[key] = float(max(0.0, raw)) + return result + + def rank_kernels( + self, problem: dict, kernel_configs: list[dict] + ) -> list[tuple[str, float]]: + """Rank candidate kernels by predicted TFLOPS (descending). + + Parameters + ---------- + problem : dict + Problem specification with keys: m, n, k, dtype, layout, split_k. + kernel_configs : list of dict + Each dict must have a 'kernel_name' key plus kernel parameters. + + Returns + ------- + list of (kernel_name, predicted_tflops) tuples, sorted descending. 
+ """ + if not kernel_configs: + return [] + + model = self._load_model("tflops") + if model is None: + raise FileNotFoundError(f"No model_tflops.lgbm in {self._model_dir}") + + rows = [] + for kc in kernel_configs: + merged = {**problem, **kc} + rows.append(merged) + + df = pd.DataFrame(rows) + X = self._feature_engine.extract_batch(df) + preds = model.predict(X) + if "tflops" in self._log_targets: + preds = np.expm1(preds) + + results = [] + for i, kc in enumerate(kernel_configs): + name = kc.get("kernel_name", f"kernel_{i}") + results.append((name, float(preds[i]))) + + results.sort(key=lambda x: -x[1]) + return results + + def select_best(self, problem: dict, kernel_configs: list[dict]) -> str: + """Return the kernel_name of the best predicted kernel.""" + ranked = self.rank_kernels(problem, kernel_configs) + if not ranked: + raise ValueError("No kernel configs provided") + return ranked[0][0] + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Predict kernel performance") + parser.add_argument( + "--model_dir", required=True, help="Directory with trained models" + ) + parser.add_argument("--m", type=int, required=True) + parser.add_argument("--n", type=int, required=True) + parser.add_argument("--k", type=int, required=True) + parser.add_argument("--layout", default="rcr") + parser.add_argument("--dtype", default="fp8") + args = parser.parse_args() + + predictor = Predictor(args.model_dir) + problem = { + "m": args.m, + "n": args.n, + "k": args.k, + "dtype": args.dtype, + "layout": args.layout, + "split_k": 1, + } + + print(f"Loading models from {args.model_dir}...") + print( + f"Problem: M={args.m} N={args.n} K={args.k} dtype={args.dtype} layout={args.layout}" + ) + + data_dir = Path(args.model_dir).parent.parent / "data" + if data_dir.exists(): + for pq in data_dir.glob("*.parquet"): + df = pd.read_parquet(pq) + kernel_names = df["kernel_name"].unique() + configs = [] + for kn in kernel_names[:10]: + row = df[df["kernel_name"] == kn].iloc[0] + configs.append(row.to_dict()) + if configs: + ranked = predictor.rank_kernels(problem, configs) + print(f"\nTop 5 kernels (from {len(configs)} candidates):") + for name, tflops in ranked[:5]: + print(f" {tflops:8.2f} TFLOPS {name}") + break diff --git a/dispatcher/heuristics/search.py b/dispatcher/heuristics/search.py new file mode 100644 index 0000000000..f9b7e13b09 --- /dev/null +++ b/dispatcher/heuristics/search.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Surrogate search for CK Tile kernel configuration optimization. + +Uses a trained LGBMRegressor as a cheap surrogate function to search the +discrete kernel parameter space (tile sizes, warp config, pipeline, etc.) +without running actual GPU benchmarks. + +Strategies: + - 'random': Sample N random valid configs, score all, return top-K. + - 'de': Discrete Differential Evolution with mutation over valid parameter choices. + +Usage: + from search import SurrogateSearch + from predict import Predictor + + predictor = Predictor("models/gemm_universal_fp8_gfx950") + searcher = SurrogateSearch(predictor, strategy='random') + results = searcher.search( + problem={"m": 128, "n": 1536, "k": 7168, "dtype": "fp8", "layout": "rcr"}, + budget=500, + ) + # results: [(config_dict, predicted_tflops), ...] 
sorted descending +""" + +import random +from typing import Optional + +import numpy as np + +from feature_engine import GemmUniversalFeatureEngine +from predict import Predictor + + +class SurrogateSearch: + """Search kernel parameter space using ML regressor as surrogate objective. + + Parameters + ---------- + predictor : Predictor + Trained predictor with a TFLOPS model. + feature_engine : GemmUniversalFeatureEngine, optional + Feature engine for parameter space and validation. If None, uses default. + strategy : str + Search strategy: 'random' or 'de' (Discrete Differential Evolution). + seed : int + Random seed for reproducibility. + """ + + def __init__( + self, + predictor: Predictor, + feature_engine: Optional[GemmUniversalFeatureEngine] = None, + strategy: str = "random", + seed: int = 42, + ): + self._predictor = predictor + self._fe = feature_engine or GemmUniversalFeatureEngine() + self._strategy = strategy + self._rng = random.Random(seed) + self._np_rng = np.random.RandomState(seed) + self._param_space = self._fe.get_parameter_space() + + def _sample_random_config(self) -> dict: + """Sample a single random config from the parameter space.""" + config = {} + for param, values in self._param_space.items(): + config[param] = self._rng.choice(values) + return config + + def _sample_valid_config(self, max_attempts: int = 50) -> Optional[dict]: + """Sample a random config that passes all validation constraints.""" + for _ in range(max_attempts): + config = self._sample_random_config() + if self._fe.validate_config(config): + return config + return None + + def _score_config(self, problem: dict, config: dict) -> float: + """Score a config using the predictor.""" + return self._predictor.predict_tflops(problem, config) + + def _search_random( + self, problem: dict, budget: int, top_k: int + ) -> list[tuple[dict, float]]: + """Random search: sample valid configs, score all, return top-K.""" + configs = [] + for _ in range(budget): + cfg = self._sample_valid_config() + if cfg is not None: + configs.append(cfg) + + if not configs: + return [] + + scored = [] + for cfg in configs: + try: + score = self._score_config(problem, cfg) + scored.append((cfg, score)) + except Exception: + continue + + scored.sort(key=lambda x: -x[1]) + return scored[:top_k] + + def _search_de( + self, + problem: dict, + budget: int, + top_k: int, + pop_size: int = 20, + mutation_rate: float = 0.3, + crossover_rate: float = 0.7, + ) -> list[tuple[dict, float]]: + """Discrete Differential Evolution. + + Uses discrete mutation: randomly swap parameters to other valid values + from the parameter space (no continuous relaxation + snap). + + Each generation: + 1. For each member of the population, create a trial vector by: + - Selecting 3 random other members (a, b, c) + - For each parameter, with probability mutation_rate, take the value + from a, b, or c (uniform choice among the three donors) + - With probability crossover_rate, take the trial value; otherwise keep original + 2. Validate the trial; if invalid, resample that parameter from the space + 3. 
Score the trial; if better than parent, replace + """ + param_names = list(self._param_space.keys()) + + population = [] + for _ in range(pop_size): + cfg = self._sample_valid_config() + if cfg is not None: + score = self._score_config(problem, cfg) + population.append((cfg, score)) + + if len(population) < 4: + return self._search_random(problem, budget, top_k) + + evals_used = len(population) + max_gens = (budget - evals_used) // pop_size + + for gen in range(max_gens): + new_pop = [] + for i, (parent, parent_score) in enumerate(population): + candidates = [j for j in range(len(population)) if j != i] + if len(candidates) < 3: + new_pop.append((parent, parent_score)) + continue + + a_idx, b_idx, c_idx = self._rng.sample(candidates, 3) + a, b, c = ( + population[a_idx][0], + population[b_idx][0], + population[c_idx][0], + ) + + trial = dict(parent) + for param in param_names: + if self._rng.random() < mutation_rate: + donor = self._rng.choice([a, b, c]) + trial[param] = donor.get(param, parent.get(param)) + + if self._rng.random() > crossover_rate: + trial[param] = parent.get(param) + + if not self._fe.validate_config(trial): + for param in param_names: + if param in trial and trial[param] not in self._param_space.get( + param, [trial[param]] + ): + trial[param] = self._rng.choice(self._param_space[param]) + if not self._fe.validate_config(trial): + new_pop.append((parent, parent_score)) + continue + + try: + trial_score = self._score_config(problem, trial) + evals_used += 1 + except Exception: + new_pop.append((parent, parent_score)) + continue + + if trial_score > parent_score: + new_pop.append((trial, trial_score)) + else: + new_pop.append((parent, parent_score)) + + population = new_pop + + population.sort(key=lambda x: -x[1]) + return population[:top_k] + + def search( + self, + problem: dict, + budget: int = 500, + top_k: int = 10, + **kwargs, + ) -> list[tuple[dict, float]]: + """Search the kernel parameter space for the best configuration. + + Parameters + ---------- + problem : dict + Problem specification: m, n, k, dtype, layout, split_k. + budget : int + Maximum number of surrogate evaluations. + top_k : int + Number of top configurations to return. + **kwargs + Strategy-specific parameters (pop_size, mutation_rate, etc.). + + Returns + ------- + list of (config_dict, predicted_tflops), sorted descending by TFLOPS. 
+ """ + if self._strategy == "random": + return self._search_random(problem, budget, top_k) + elif self._strategy == "de": + return self._search_de(problem, budget, top_k, **kwargs) + else: + raise ValueError(f"Unknown strategy: {self._strategy}") + + +if __name__ == "__main__": + import argparse + import time + + parser = argparse.ArgumentParser( + description="Surrogate search for optimal kernel config" + ) + parser.add_argument("--model_dir", required=True) + parser.add_argument("--m", type=int, required=True) + parser.add_argument("--n", type=int, required=True) + parser.add_argument("--k", type=int, required=True) + parser.add_argument("--dtype", default="fp8") + parser.add_argument("--layout", default="rcr") + parser.add_argument("--strategy", default="random", choices=["random", "de"]) + parser.add_argument("--budget", type=int, default=500) + parser.add_argument("--top_k", type=int, default=10) + args = parser.parse_args() + + predictor = Predictor(args.model_dir) + searcher = SurrogateSearch(predictor, strategy=args.strategy) + problem = { + "m": args.m, + "n": args.n, + "k": args.k, + "dtype": args.dtype, + "layout": args.layout, + "split_k": 1, + } + + print(f"Searching with strategy={args.strategy}, budget={args.budget}...") + t0 = time.time() + results = searcher.search(problem, budget=args.budget, top_k=args.top_k) + elapsed = time.time() - t0 + + print(f"\nTop {len(results)} configs found in {elapsed * 1000:.1f}ms:") + for i, (cfg, tflops) in enumerate(results): + tile_str = f"{cfg.get('tile_m', '?')}x{cfg.get('tile_n', '?')}x{cfg.get('tile_k', '?')}" + warp_str = f"{cfg.get('warp_m', '?')}x{cfg.get('warp_n', '?')}x{cfg.get('warp_k', '?')}" + print( + f" #{i + 1}: {tflops:8.2f} TFLOPS tile={tile_str} warp={warp_str} " + f"pipeline={cfg.get('pipeline', '?')} scheduler={cfg.get('scheduler', '?')}" + ) diff --git a/dispatcher/heuristics/tests/__init__.py b/dispatcher/heuristics/tests/__init__.py new file mode 100644 index 0000000000..1df4857184 --- /dev/null +++ b/dispatcher/heuristics/tests/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT diff --git a/dispatcher/heuristics/tests/test_data_pipeline.py b/dispatcher/heuristics/tests/test_data_pipeline.py new file mode 100644 index 0000000000..d643138693 --- /dev/null +++ b/dispatcher/heuristics/tests/test_data_pipeline.py @@ -0,0 +1,368 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for data_pipeline.py. + +Covers: kernel name parsing, layout derivation, streaming log parsing, +schema validation, and corner cases (empty logs, malformed JSON, single-shape). 
+""" + +import tempfile +from pathlib import Path + +import pandas as pd +import pytest + +import sys + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from data_pipeline import ( + parse_kernel_name, + _layout_from_problem, + parse_streaming_log, + save_parquet, + load_parquet, + CANONICAL_COLUMNS, +) + + +# --------------------------------------------------------------------------- +# parse_kernel_name +# --------------------------------------------------------------------------- + + +class TestParseKernelName: + def test_standard_name(self): + name = "gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128" + result = parse_kernel_name(name) + assert result["dtype"] == "fp8" + assert result["layout"] == "rcr" + assert result["pipeline"] == "compv3" + assert result["epilogue"] == "cshuffle" + assert result["scheduler"] == "intrawave" + assert result["pad_m"] is False + assert result["pad_n"] is False + assert result["pad_k"] is False + assert result["persistent"] is False + assert result["tile_m"] == 128 + assert result["tile_n"] == 128 + assert result["tile_k"] == 128 + assert result["warp_m"] == 1 + assert result["warp_n"] == 4 + assert result["warp_k"] == 1 + assert result["warp_tile_m"] == 16 + assert result["warp_tile_n"] == 16 + assert result["warp_tile_k"] == 128 + + def test_with_padding_and_persistent(self): + name = "gemm_universal_fp16_rrr_compv4_default_interwave_True_True_True_True_256x256x64_2x2x1_32x32x16" + result = parse_kernel_name(name) + assert result["dtype"] == "fp16" + assert result["layout"] == "rrr" + assert result["pad_m"] is True + assert result["pad_n"] is True + assert result["pad_k"] is True + assert result["persistent"] is True + assert result["tile_m"] == 256 + + def test_empty_name(self): + assert parse_kernel_name("") == {} + + def test_malformed_name(self): + assert parse_kernel_name("not_a_kernel_name") == {} + + def test_partial_name(self): + result = parse_kernel_name("gemm_universal_fp8_rcr_compv3") + assert result.get("dtype") == "fp8" + assert result.get("layout") == "rcr" + assert "tile_m" not in result # not enough parts + + def test_all_layouts(self): + for layout in ["rcr", "rrr", "crr", "ccr"]: + name = f"gemm_universal_fp8_{layout}_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128" + result = parse_kernel_name(name) + assert result["layout"] == layout + + +# --------------------------------------------------------------------------- +# _layout_from_problem +# --------------------------------------------------------------------------- + + +class TestLayoutFromProblem: + def test_rcr(self): + assert ( + _layout_from_problem( + { + "layout_a": "RowMajor", + "layout_b": "ColumnMajor", + "layout_c": "RowMajor", + } + ) + == "rcr" + ) + + def test_rrr(self): + assert ( + _layout_from_problem( + {"layout_a": "RowMajor", "layout_b": "RowMajor", "layout_c": "RowMajor"} + ) + == "rrr" + ) + + def test_empty(self): + assert _layout_from_problem({}) == "???" 
+ + def test_case_insensitive(self): + assert ( + _layout_from_problem( + { + "layout_a": "rowmajor", + "layout_b": "COLUMNMAJOR", + "layout_c": "RowMajor", + } + ) + == "rcr" + ) + + +# --------------------------------------------------------------------------- +# parse_streaming_log +# --------------------------------------------------------------------------- + +SAMPLE_LOG = """\ +================================================================================ +LOG FILE: test.log +================================================================================ +CK Tile Profiling Run +GPU ID: 0 + +--- Running CK Tile benchmarks on GPU 0 --- + +======================================== +Shape 1: M=16 N=1536 K=7168 dtype=fp8 layout=rcr +======================================== +Found 2 kernels +{ + "name": "gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128", + "problem": { + "split_k":1, + "m":16, + "n":1536, + "k":7168, + "stride_a":7168, + "stride_b":7168, + "stride_c":1536, + "dtype_a":"fp8", + "dtype_b":"fp8", + "dtype_acc":"fp32", + "dtype_c":"fp16", + "layout_a":"RowMajor", + "layout_b":"ColumnMajor", + "layout_c":"RowMajor", + "structured_sparsity":false +}, + "perf_result": { + "latency(ms)": 0.04, + "tflops(TFlops)": 8.81, + "bandwidth(GB/s)": 279.51 +} +} +{ + "name": "gemm_universal_fp8_rcr_compv4_default_intrawave_False_False_False_False_128x128x64_2x2x1_32x32x16", + "problem": { + "split_k":1, + "m":16, + "n":1536, + "k":7168, + "stride_a":7168, + "stride_b":7168, + "stride_c":1536, + "dtype_a":"fp8", + "dtype_b":"fp8", + "dtype_acc":"fp32", + "dtype_c":"fp16", + "layout_a":"RowMajor", + "layout_b":"ColumnMajor", + "layout_c":"RowMajor", + "structured_sparsity":false +}, + "perf_result": { + "latency(ms)": 0.05, + "tflops(TFlops)": 7.22, + "bandwidth(GB/s)": 228.85 +} +} + +======================================== +Shape 2: M=20480 N=7168 K=256 dtype=fp8 layout=rcr +======================================== +Found 1 kernels +{ + "name": "gemm_universal_fp8_rcr_mem_cshuffle_intrawave_False_False_False_True_64x64x128_1x4x1_16x16x32", + "problem": { + "split_k":1, + "m":20480, + "n":7168, + "k":256, + "stride_a":256, + "stride_b":256, + "stride_c":7168, + "dtype_a":"fp8", + "dtype_b":"fp8", + "dtype_acc":"fp32", + "dtype_c":"fp16", + "layout_a":"RowMajor", + "layout_b":"ColumnMajor", + "layout_c":"RowMajor", + "structured_sparsity":false +}, + "perf_result": { + "latency(ms)": 0.15, + "tflops(TFlops)": 505.00, + "bandwidth(GB/s)": 1200.50 +} +} +""" + + +class TestParseStreamingLog: + def _write_log(self, content: str) -> Path: + f = tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False) + f.write(content) + f.close() + return Path(f.name) + + def test_basic_parse(self): + path = self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path, arch="gfx950") + assert len(df) == 3 + assert df["arch"].iloc[0] == "gfx950" + assert df["m"].tolist() == [16, 16, 20480] + assert df["n"].tolist() == [1536, 1536, 7168] + assert df["k"].tolist() == [7168, 7168, 256] + + def test_tflops_values(self): + path = self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path) + assert df["measured_tflops"].tolist() == pytest.approx([8.81, 7.22, 505.0]) + + def test_kernel_config_parsed(self): + path = self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path) + assert df["tile_m"].iloc[0] == 128 + assert df["pipeline"].iloc[0] == "compv3" + assert df["pipeline"].iloc[1] == "compv4" + + def test_layout_derived_from_json(self): + path = 
self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path) + assert all(df["layout"] == "rcr") + + def test_empty_log(self): + path = self._write_log("No shapes here\nJust noise\n") + df = parse_streaming_log(path) + assert len(df) == 0 + for col in CANONICAL_COLUMNS: + assert col in df.columns + + def test_single_kernel(self): + log = """\ +Shape 1: M=1 N=1 K=1 dtype=fp8 layout=rcr +{ + "name": "gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128", + "problem": {"split_k":1, "m":1, "n":1, "k":1, "dtype_a":"fp8", "dtype_b":"fp8", "layout_a":"RowMajor", "layout_b":"ColumnMajor", "layout_c":"RowMajor"}, + "perf_result": {"latency(ms)": 0.001, "tflops(TFlops)": 0.002, "bandwidth(GB/s)": 0.01} +} +""" + path = self._write_log(log) + df = parse_streaming_log(path) + assert len(df) == 1 + assert df["m"].iloc[0] == 1 + assert bool(df["is_valid"].iloc[0]) is True + + def test_zero_tflops_marked_invalid(self): + log = """\ +Shape 1: M=16 N=16 K=16 dtype=fp8 layout=rcr +{ + "name": "test_kernel", + "problem": {"split_k":1, "m":16, "n":16, "k":16, "dtype_a":"fp8"}, + "perf_result": {"latency(ms)": 0.0, "tflops(TFlops)": 0.0, "bandwidth(GB/s)": 0.0} +} +""" + path = self._write_log(log) + df = parse_streaming_log(path) + assert len(df) == 1 + assert bool(df["is_valid"].iloc[0]) is False + + def test_malformed_json_skipped(self): + log = """\ +Shape 1: M=16 N=16 K=16 dtype=fp8 layout=rcr +{ + "name": "good_kernel", + "problem": {"split_k":1, "m":16, "n":16, "k":16, "dtype_a":"fp8"}, + "perf_result": {"latency(ms)": 0.01, "tflops(TFlops)": 1.0, "bandwidth(GB/s)": 10.0} +} +{ this is not valid json } +{ + "name": "another_good", + "problem": {"split_k":1, "m":16, "n":16, "k":16, "dtype_a":"fp8"}, + "perf_result": {"latency(ms)": 0.02, "tflops(TFlops)": 2.0, "bandwidth(GB/s)": 20.0} +} +""" + path = self._write_log(log) + df = parse_streaming_log(path) + assert len(df) == 2 + + def test_extreme_shapes(self): + """Tiny M=1 (single token) and very large M=20480.""" + path = self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path) + assert 1 not in df["m"].values # sample has M=16, M=20480 + assert 16 in df["m"].values + assert 20480 in df["m"].values + + def test_run_id_assigned(self): + path = self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path, run_id="test_run_123") + assert all(df["run_id"] == "test_run_123") + + def test_op_type_assigned(self): + path = self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path, op_type="gemm_streamk") + assert all(df["op_type"] == "gemm_streamk") + + +# --------------------------------------------------------------------------- +# Parquet round-trip +# --------------------------------------------------------------------------- + + +class TestParquetIO: + def test_round_trip(self, tmp_path): + df = pd.DataFrame( + { + "m": [16, 32], + "n": [1536, 1536], + "k": [7168, 7168], + "measured_tflops": [8.81, 15.0], + } + ) + path = tmp_path / "test.parquet" + save_parquet(df, path) + loaded = load_parquet(path) + assert len(loaded) == 2 + assert loaded["m"].tolist() == [16, 32] + + def test_creates_parent_dirs(self, tmp_path): + path = tmp_path / "sub" / "dir" / "test.parquet" + df = pd.DataFrame({"x": [1]}) + save_parquet(df, path) + assert path.exists() + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/tests/test_dispatcher_integration.py b/dispatcher/heuristics/tests/test_dispatcher_integration.py new file mode 100644 index 0000000000..a80438629d --- /dev/null 
+++ b/dispatcher/heuristics/tests/test_dispatcher_integration.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for dispatcher_integration.py. + +Covers: kernel name parsing to feature dict, feature dict to dispatcher config +(name mapping inversion), MLKernelSpec creation, binary pool loading, and +the ML heuristic function. +""" + +import json +import sys +from pathlib import Path + +import lightgbm as lgb +import numpy as np +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from dispatcher_integration import ( + kernel_config_to_feature_dict, + feature_dict_to_dispatcher_config, + feature_dict_to_ml_spec, + ml_spec_to_dispatcher_config, + create_ml_heuristic, + load_kernel_pool_from_binaries, + MLKernelSpec, + LAYOUT_TO_DISPATCHER, +) +from feature_engine import GemmUniversalFeatureEngine + + +SAMPLE_KERNEL_NAME = ( + "gemm_universal_fp8_rcr_compv3_cshuffle_intrawave" + "_False_False_False_False_128x128x128_1x4x1_16x16x128" +) + + +# --------------------------------------------------------------------------- +# kernel_config_to_feature_dict +# --------------------------------------------------------------------------- + + +class TestKernelConfigToFeatureDict: + def test_parses_standard_name(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + assert feat["tile_m"] == 128 + assert feat["tile_n"] == 128 + assert feat["tile_k"] == 128 + assert feat["warp_m"] == 1 # warps per block + assert feat["warp_n"] == 4 + assert feat["warp_k"] == 1 + assert feat["warp_tile_m"] == 16 + assert feat["warp_tile_n"] == 16 + assert feat["warp_tile_k"] == 128 + assert feat["pipeline"] == "compv3" + assert feat["scheduler"] == "intrawave" + assert feat["epilogue"] == "cshuffle" + assert feat["kernel_name"] == SAMPLE_KERNEL_NAME + + def test_empty_name_returns_empty(self): + assert kernel_config_to_feature_dict("") == {} + + def test_invalid_name_returns_empty(self): + assert kernel_config_to_feature_dict("not_a_kernel") == {} + + +# --------------------------------------------------------------------------- +# Name mapping: feature dict <-> dispatcher config +# --------------------------------------------------------------------------- + + +class TestNameMapping: + """The critical inversion: feature engine warp_m/n/k (warps per block) + maps to dispatcher wave_m/n/k, and feature engine warp_tile_m/n/k + maps to dispatcher warp_m/n/k.""" + + def test_warp_to_wave_mapping(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + disp = feature_dict_to_dispatcher_config(feat) + assert disp["wave_m"] == feat["warp_m"] # 1 + assert disp["wave_n"] == feat["warp_n"] # 4 + assert disp["wave_k"] == feat["warp_k"] # 1 + + def test_warp_tile_to_warp_mapping(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + disp = feature_dict_to_dispatcher_config(feat) + assert disp["warp_m"] == feat["warp_tile_m"] # 16 + assert disp["warp_n"] == feat["warp_tile_n"] # 16 + assert disp["warp_k"] == feat["warp_tile_k"] # 128 + + def test_tile_dims_pass_through(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + disp = feature_dict_to_dispatcher_config(feat) + assert disp["tile_m"] == 128 + assert disp["tile_n"] == 128 + assert disp["tile_k"] == 128 + + def test_pipeline_passes_through(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + disp = feature_dict_to_dispatcher_config(feat) + assert disp["pipeline"] == "compv3" + 
assert disp["scheduler"] == "intrawave" + assert disp["epilogue"] == "cshuffle" + + def test_rcr_layout_mapping(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + disp = feature_dict_to_dispatcher_config(feat, dtype="fp8") + assert disp["layout_a"] == "row" + assert disp["layout_b"] == "col" + assert disp["layout_c"] == "row" + + def test_all_layouts(self): + for layout, (la, lb, lc) in LAYOUT_TO_DISPATCHER.items(): + feat = {"layout": layout, "tile_m": 128} + disp = feature_dict_to_dispatcher_config(feat) + assert disp["layout_a"] == la + assert disp["layout_b"] == lb + assert disp["layout_c"] == lc + + +# --------------------------------------------------------------------------- +# MLKernelSpec +# --------------------------------------------------------------------------- + + +class TestMLKernelSpec: + def test_from_feature_dict(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + spec = feature_dict_to_ml_spec(feat, predicted_tflops=123.4) + assert spec.kernel_name == SAMPLE_KERNEL_NAME + assert spec.predicted_tflops == 123.4 + assert spec.tile_m == 128 + assert spec.wave_m == 1 # was warp_m in feature space + assert spec.warp_m == 16 # was warp_tile_m in feature space + + def test_spec_to_dispatcher_config(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + spec = feature_dict_to_ml_spec(feat, 100.0) + disp = ml_spec_to_dispatcher_config(spec, dtype="fp8", arch="gfx950") + assert disp["tile_m"] == 128 + assert disp["wave_m"] == 1 + assert disp["warp_m"] == 16 + assert disp["gfx_arch"] == "gfx950" + assert disp["dtype_a"] == "fp8" + + def test_roundtrip_preserves_values(self): + """feature_dict -> MLKernelSpec -> dispatcher_config should be consistent.""" + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + spec = feature_dict_to_ml_spec(feat, 0.0) + disp_from_spec = ml_spec_to_dispatcher_config(spec) + disp_from_feat = feature_dict_to_dispatcher_config(feat) + for key in [ + "tile_m", + "tile_n", + "tile_k", + "wave_m", + "wave_n", + "wave_k", + "warp_m", + "warp_n", + "warp_k", + "pipeline", + "scheduler", + "epilogue", + ]: + assert disp_from_spec[key] == disp_from_feat[key], f"Mismatch on {key}" + + +# --------------------------------------------------------------------------- +# Binary pool loading +# --------------------------------------------------------------------------- + + +class TestLoadKernelPool: + def test_loads_from_real_bin_dir(self): + bin_dir = Path("/workspace/ck_tile/bin") + if not bin_dir.exists(): + pytest.skip("No /workspace/ck_tile/bin") + pool = load_kernel_pool_from_binaries(bin_dir) + assert len(pool) > 0 + assert "tile_m" in pool[0] + assert "kernel_name" in pool[0] + + def test_empty_dir_returns_empty(self, tmp_path): + pool = load_kernel_pool_from_binaries(tmp_path) + assert pool == [] + + +# --------------------------------------------------------------------------- +# ML heuristic function +# --------------------------------------------------------------------------- + + +class TestCreateMLHeuristic: + @pytest.fixture + def mock_model_dir(self, tmp_path): + """Create a minimal model for testing the heuristic flow.""" + fe = GemmUniversalFeatureEngine() + n_features = len(fe.get_feature_names()) + np.random.seed(42) + X = np.random.rand(100, n_features) + y = np.random.rand(100) * 500 + model = lgb.LGBMRegressor(n_estimators=5, verbose=-1) + model.fit(X, y) + model.booster_.save_model(str(tmp_path / "model_tflops.lgbm")) + spec = { + "feature_names": fe.get_feature_names(), + 
"categorical_features": fe.get_categorical_features(), + } + with open(tmp_path / "feature_spec.json", "w") as f: + json.dump(spec, f) + return tmp_path + + def _make_pool(self): + """Create a small synthetic kernel pool.""" + names = [ + "gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128", + "gemm_universal_fp8_rcr_compv4_default_intrawave_False_False_False_False_128x128x64_2x2x1_32x32x16", + "gemm_universal_fp8_rcr_mem_cshuffle_interwave_False_False_False_False_64x64x128_1x4x1_16x16x32", + ] + return [kernel_config_to_feature_dict(n) for n in names] + + def test_returns_ml_kernel_spec(self, mock_model_dir): + pool = self._make_pool() + heuristic = create_ml_heuristic(mock_model_dir, kernel_pool=pool) + result = heuristic(1024, 1024, 1024) + assert isinstance(result, MLKernelSpec) + assert result.tile_m > 0 + assert isinstance(result.predicted_tflops, float) + + def test_returns_valid_kernel_from_pool(self, mock_model_dir): + pool = self._make_pool() + pool_names = {p["kernel_name"] for p in pool} + heuristic = create_ml_heuristic(mock_model_dir, kernel_pool=pool) + result = heuristic(1024, 1024, 1024) + assert result.kernel_name in pool_names + + def test_different_shapes_may_select_different_kernels(self, mock_model_dir): + pool = self._make_pool() + heuristic = create_ml_heuristic(mock_model_dir, kernel_pool=pool) + r1 = heuristic(16, 1536, 7168) + r2 = heuristic(8192, 8192, 256) + # At minimum both should return valid specs + assert r1.tile_m > 0 + assert r2.tile_m > 0 + + def test_m1_corner_case(self, mock_model_dir): + pool = self._make_pool() + heuristic = create_ml_heuristic(mock_model_dir, kernel_pool=pool) + result = heuristic(1, 4096, 4096) + assert isinstance(result, MLKernelSpec) + assert np.isfinite(result.predicted_tflops) + + def test_empty_pool_raises(self, mock_model_dir): + with pytest.raises(ValueError, match="No kernel configs"): + create_ml_heuristic(mock_model_dir, kernel_pool=[]) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/tests/test_evaluate.py b/dispatcher/heuristics/tests/test_evaluate.py new file mode 100644 index 0000000000..bcbe39af9d --- /dev/null +++ b/dispatcher/heuristics/tests/test_evaluate.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for evaluate.py. + +Covers: shape family classification, K-depth regime classification, +and basic evaluation metric checks. 
+""" + +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from evaluate import classify_shape_family, classify_k_regime + + +class TestClassifyShapeFamily: + def test_tiny_m(self): + assert classify_shape_family(1, 4096, 4096) == "tiny_m" + assert classify_shape_family(16, 1536, 7168) == "tiny_m" + + def test_small_m(self): + assert classify_shape_family(32, 1536, 7168) == "small_m" + assert classify_shape_family(128, 4096, 4096) == "small_m" + + def test_medium_m(self): + assert classify_shape_family(256, 1024, 1024) == "medium_m" + assert classify_shape_family(2048, 2048, 2048) == "medium_m" + + def test_large_m(self): + assert classify_shape_family(4096, 4096, 4096) == "large_m" + assert classify_shape_family(20480, 7168, 256) == "large_m" + + +class TestClassifyKRegime: + def test_shallow(self): + assert classify_k_regime(256) == "shallow_k" + assert classify_k_regime(32) == "shallow_k" + + def test_medium(self): + assert classify_k_regime(1024) == "medium_k" + assert classify_k_regime(2048) == "medium_k" + + def test_deep(self): + assert classify_k_regime(4096) == "deep_k" + assert classify_k_regime(7168) == "deep_k" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/tests/test_feature_engine.py b/dispatcher/heuristics/tests/test_feature_engine.py new file mode 100644 index 0000000000..492623ce99 --- /dev/null +++ b/dispatcher/heuristics/tests/test_feature_engine.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for feature_engine.py. + +Covers: feature count consistency, formula correctness (tile efficiency, LDS, +arithmetic intensity), corner-case shapes (M=1, huge M, square, skinny-K), +parameter space validity, config validation, and batch vs single extraction parity. 
+""" + +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from feature_engine import ( + GemmUniversalFeatureEngine, +) + + +@pytest.fixture +def fe(): + """Default feature engine with MI355X-like hardware.""" + return GemmUniversalFeatureEngine( + num_cus=256, + lds_capacity=65536, + max_clock_mhz=2400, + simds_per_cu=4, + shader_engines=32, + max_waves_per_cu=32, + wavefront_size=64, + l1_cache_kb=32, + l2_cache_kb=4096, + l3_cache_kb=262144, + num_xcd=8, + ) + + +def _make_problem(m=1024, n=1024, k=1024, dtype="fp8", layout="rcr", split_k=1): + return { + "m": m, + "n": n, + "k": k, + "dtype": dtype, + "layout": layout, + "split_k": split_k, + } + + +def _make_kernel( + tile_m=128, + tile_n=128, + tile_k=64, + warp_m=2, + warp_n=2, + warp_k=1, + warp_tile_m=32, + warp_tile_n=32, + warp_tile_k=16, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + pad_m=False, + pad_n=False, + pad_k=False, + persistent=False, +): + return { + "tile_m": tile_m, + "tile_n": tile_n, + "tile_k": tile_k, + "warp_m": warp_m, + "warp_n": warp_n, + "warp_k": warp_k, + "warp_tile_m": warp_tile_m, + "warp_tile_n": warp_tile_n, + "warp_tile_k": warp_tile_k, + "pipeline": pipeline, + "scheduler": scheduler, + "epilogue": epilogue, + "pad_m": pad_m, + "pad_n": pad_n, + "pad_k": pad_k, + "persistent": persistent, + } + + +# --------------------------------------------------------------------------- +# Basic consistency +# --------------------------------------------------------------------------- + + +class TestFeatureConsistency: + def test_feature_count_matches_names(self, fe): + prob = _make_problem() + kern = _make_kernel() + vec = fe.extract(prob, kern) + assert len(vec) == len(fe.get_feature_names()) + + def test_feature_count_is_72(self, fe): + assert len(fe.get_feature_names()) == 72 + + def test_no_nan_in_output(self, fe): + prob = _make_problem() + kern = _make_kernel() + vec = fe.extract(prob, kern) + assert not np.any(np.isnan(vec)) + + def test_no_inf_in_output(self, fe): + prob = _make_problem() + kern = _make_kernel() + vec = fe.extract(prob, kern) + assert not np.any(np.isinf(vec)) + + def test_categorical_features_in_names(self, fe): + names = fe.get_feature_names() + for cat in fe.get_categorical_features(): + assert cat in names + + +# --------------------------------------------------------------------------- +# Formula correctness +# --------------------------------------------------------------------------- + + +class TestTileEfficiency: + """Tile efficiency: fraction of the last tile that is useful work.""" + + def test_perfectly_divisible(self, fe): + prob = _make_problem(m=256, n=256, k=128) + kern = _make_kernel(tile_m=128, tile_n=128, tile_k=64) + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + assert vec[names.index("tile_eff_m")] == 1.0 + assert vec[names.index("tile_eff_n")] == 1.0 + assert vec[names.index("tile_eff_k")] == 1.0 + assert vec[names.index("overall_tile_efficiency")] == 1.0 + + def test_not_divisible(self, fe): + prob = _make_problem(m=100, n=100, k=100) + kern = _make_kernel(tile_m=128, tile_n=128, tile_k=64) + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + assert vec[names.index("tile_eff_m")] == pytest.approx(100 / 128) + assert vec[names.index("tile_eff_n")] == pytest.approx(100 / 128) + assert vec[names.index("tile_eff_k")] == pytest.approx(36 / 64) + + def test_m_equals_1(self, fe): + """Single-token 
inference: M=1, tile_m=128 => eff = 1/128.""" + prob = _make_problem(m=1) + kern = _make_kernel(tile_m=128) + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + assert vec[names.index("tile_eff_m")] == pytest.approx(1.0 / 128.0) + + +class TestLDSUsage: + def test_lds_formula(self, fe): + prob = _make_problem(dtype="fp8") + kern = _make_kernel(tile_m=128, tile_n=128, tile_k=64) + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + expected = (128 * 64 + 128 * 64) * 1.0 # fp8 = 1 byte + assert vec[names.index("lds_usage_estimate")] == expected + + def test_lds_ratio_compv4(self, fe): + """compv4 has 32KB LDS limit, not 64KB.""" + prob = _make_problem(dtype="fp8") + kern = _make_kernel(tile_m=128, tile_n=128, tile_k=64, pipeline="compv4") + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + lds_est = (128 * 64 + 128 * 64) * 1.0 + assert vec[names.index("lds_usage_ratio")] == pytest.approx(lds_est / 32768) + + def test_lds_fp16_doubles(self, fe): + prob = _make_problem(dtype="fp16") + kern = _make_kernel(tile_m=128, tile_n=128, tile_k=64) + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + expected = (128 * 64 + 128 * 64) * 2.0 # fp16 = 2 bytes + assert vec[names.index("lds_usage_estimate")] == expected + + +class TestArithmeticIntensity: + def test_square_shape(self, fe): + M, N, K = 1024, 1024, 1024 + prob = _make_problem(m=M, n=N, k=K, dtype="fp8") + kern = _make_kernel() + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + mem = (M * K + K * N + M * N) * 1.0 + expected = (2.0 * M * N * K) / mem + assert vec[names.index("arithmetic_intensity")] == pytest.approx(expected) + + def test_skinny_k(self, fe): + """Small K => low arithmetic intensity (memory-bound).""" + prob = _make_problem(m=8192, n=8192, k=32, dtype="fp8") + kern = _make_kernel() + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + assert vec[names.index("arithmetic_intensity")] < 100 + + def test_deep_k(self, fe): + """Large K => high arithmetic intensity (compute-bound).""" + prob = _make_problem(m=256, n=256, k=8192, dtype="fp8") + kern = _make_kernel() + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + assert vec[names.index("arithmetic_intensity")] > 100 + + +# --------------------------------------------------------------------------- +# Corner-case shapes +# --------------------------------------------------------------------------- + + +class TestCornerCaseShapes: + def test_m1_single_token(self, fe): + vec = fe.extract(_make_problem(m=1, n=4096, k=4096), _make_kernel()) + assert not np.any(np.isnan(vec)) + + def test_m1_n1_k1_minimum(self, fe): + vec = fe.extract(_make_problem(m=1, n=1, k=1), _make_kernel()) + assert not np.any(np.isnan(vec)) + assert not np.any(np.isinf(vec)) + + def test_very_large_m(self, fe): + vec = fe.extract(_make_problem(m=20480, n=7168, k=7168), _make_kernel()) + assert not np.any(np.isnan(vec)) + + def test_non_power_of_2(self, fe): + vec = fe.extract(_make_problem(m=1536, n=7168, k=2304), _make_kernel()) + assert not np.any(np.isnan(vec)) + + def test_prime_dimensions(self, fe): + vec = fe.extract(_make_problem(m=17, n=31, k=127), _make_kernel()) + assert not np.any(np.isnan(vec)) + + def test_tall_matrix(self, fe): + """M >> N (tall matrix).""" + prob = _make_problem(m=16384, n=64, k=1024) + vec = fe.extract(prob, _make_kernel()) + names = fe.get_feature_names() + assert vec[names.index("aspect_ratio_mn")] > 100 + + def test_wide_matrix(self, fe): + """N >> M (wide matrix).""" + prob = 
_make_problem(m=64, n=16384, k=1024) + vec = fe.extract(prob, _make_kernel()) + names = fe.get_feature_names() + assert vec[names.index("aspect_ratio_mn")] < 0.01 + + +# --------------------------------------------------------------------------- +# Batch vs single extraction parity +# --------------------------------------------------------------------------- + + +class TestBatchParity: + def test_batch_matches_single(self, fe): + """Vectorized batch should produce identical results to row-by-row.""" + rows = [ + { + "m": 16, + "n": 1536, + "k": 7168, + "split_k": 1, + "dtype": "fp8", + "layout": "rcr", + "tile_m": 128, + "tile_n": 128, + "tile_k": 128, + "warp_m": 1, + "warp_n": 4, + "warp_k": 1, + "warp_tile_m": 16, + "warp_tile_n": 16, + "warp_tile_k": 128, + "pipeline": "compv3", + "scheduler": "intrawave", + "epilogue": "cshuffle", + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + }, + { + "m": 20480, + "n": 7168, + "k": 256, + "split_k": 1, + "dtype": "fp8", + "layout": "rcr", + "tile_m": 64, + "tile_n": 64, + "tile_k": 128, + "warp_m": 2, + "warp_n": 2, + "warp_k": 1, + "warp_tile_m": 32, + "warp_tile_n": 32, + "warp_tile_k": 16, + "pipeline": "mem", + "scheduler": "interwave", + "epilogue": "default", + "pad_m": True, + "pad_n": True, + "pad_k": True, + "persistent": True, + }, + ] + df = pd.DataFrame(rows) + batch_result = fe.extract_batch(df) + + for i, row_dict in enumerate(rows): + single_result = fe.extract(row_dict, row_dict) + np.testing.assert_allclose( + batch_result[i], + single_result, + rtol=1e-5, + atol=1e-5, + err_msg=f"Mismatch at row {i}", + ) + + +# --------------------------------------------------------------------------- +# Parameter space and validation +# --------------------------------------------------------------------------- + + +class TestParameterSpace: + def test_parameter_space_non_empty(self, fe): + ps = fe.get_parameter_space() + assert len(ps) > 0 + assert "tile_m" in ps + assert "pipeline" in ps + + def test_valid_config_passes(self, fe): + config = { + "tile_m": 128, + "tile_n": 128, + "tile_k": 64, + "warp_m": 2, + "warp_n": 2, + "warp_k": 1, + "pipeline": "compv3", + "scheduler": "intrawave", + "epilogue": "cshuffle", + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + } + assert fe.validate_config(config) is True + + def test_invalid_tile_rejected(self, fe): + config = {"tile_m": 999} + assert fe.validate_config(config) is False + + def test_lds_constraint_rejects_huge_tile(self, fe): + config = { + "tile_m": 256, + "tile_n": 256, + "tile_k": 256, + "warp_m": 2, + "warp_n": 2, + "warp_k": 1, + "pipeline": "compv4", + } + assert fe.validate_config(config) is False + + def test_project_to_valid_snaps(self, fe): + config = {"tile_m": 100, "tile_n": 200, "pipeline": "compv3"} + projected = fe.project_to_valid(config) + assert projected["tile_m"] == 128 + assert projected["tile_n"] == 192 + assert projected["pipeline"] == "compv3" + + +# --------------------------------------------------------------------------- +# Hardware features +# --------------------------------------------------------------------------- + + +class TestHardwareFeatures: + def test_hardware_values_propagated(self, fe): + vec = fe.extract(_make_problem(), _make_kernel()) + names = fe.get_feature_names() + assert vec[names.index("hw_num_cus")] == 256 + assert vec[names.index("hw_max_clock_mhz")] == 2400 + assert vec[names.index("hw_total_simds")] == 256 * 4 + assert vec[names.index("hw_num_xcd")] == 8 + + def 
test_different_hardware(self): + fe_small = GemmUniversalFeatureEngine(num_cus=120, max_clock_mhz=1800) + vec = fe_small.extract(_make_problem(), _make_kernel()) + names = fe_small.get_feature_names() + assert vec[names.index("hw_num_cus")] == 120 + assert vec[names.index("hw_max_clock_mhz")] == 1800 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/tests/test_feature_parity.py b/dispatcher/heuristics/tests/test_feature_parity.py new file mode 100644 index 0000000000..43f6968b88 --- /dev/null +++ b/dispatcher/heuristics/tests/test_feature_parity.py @@ -0,0 +1,357 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Test that the C++ extract_features() in ml_heuristic.hpp produces identical +values to the Python GemmUniversalFeatureEngine.extract(). + +This test uses ctypes to call the C++ feature extraction compiled into a +small shared library, then compares against Python output. If compilation +fails (no HIP/ROCm), it falls back to verifying the Python feature engine +against manually computed expected values for specific test cases. +""" + +import math +import sys +from pathlib import Path + +import numpy as np +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) +from feature_engine import ( + GemmUniversalFeatureEngine, + PIPELINE_MAP, + SCHEDULER_MAP, + EPILOGUE_MAP, + LAYOUT_MAP, +) + + +def _compute_features_manually( + M, + N, + K, + split_k, + dtype, + layout, + tile_m, + tile_n, + tile_k, + warp_m, + warp_n, + warp_k, + warp_tile_m, + warp_tile_n, + warp_tile_k, + pipeline, + scheduler, + epilogue, + pad_m, + pad_n, + pad_k, + persistent, + hw, +): + """Recompute features independently to verify the Python engine.""" + bpe_map = {"fp8": 1.0, "fp16": 2.0, "bf16": 2.0, "fp32": 4.0} + bpe = bpe_map.get(dtype, 1.0) + + log2_M = math.log2(max(M, 1)) + log2_N = math.log2(max(N, 1)) + log2_K = math.log2(max(K, 1)) + log2_MNK = math.log2(max(M * N * K, 1)) + mem = (M * K + K * N + M * N) * bpe + ai = (2.0 * M * N * K) / max(mem, 1) + + lds_est = (tile_m * tile_k + tile_n * tile_k) * bpe + lds_cap = 32768 if pipeline == "compv4" else hw["lds_capacity"] + + ntm = math.ceil(M / max(tile_m, 1)) + ntn = math.ceil(N / max(tile_n, 1)) + ntk = math.ceil(K / max(tile_k, 1)) + + def eff(d, t): + if t <= 0: + return 1.0 + r = d % t + return r / t if r > 0 else 1.0 + + # Problem-to-tile ratios + ratio_M_to_tile_m = M / max(tile_m, 1) + ratio_N_to_tile_n = N / max(tile_n, 1) + ratio_K_to_tile_k = K / max(tile_k, 1) + + # Binary features: problem smaller than tile + problem_smaller_than_tile_m = float(M < tile_m) + problem_smaller_than_tile_n = float(N < tile_n) + problem_smaller_than_tile_k = float(K < tile_k) + any_dim_too_small = float((M < tile_m) or (N < tile_n) or (K < tile_k)) + + # Padding requirement features + needs_padding_m = float(tile_m > 0 and M % tile_m != 0) + needs_padding_n = float(tile_n > 0 and N % tile_n != 0) + needs_padding_k = float(tile_k > 0 and K % tile_k != 0) + + # Interaction features + has_padding_when_needed_m = float(needs_padding_m and pad_m) + has_padding_when_needed_n = float(needs_padding_n and pad_n) + has_padding_when_needed_k = float(needs_padding_k and pad_k) + + # Missing padding features + missing_required_padding_m = float(needs_padding_m and not pad_m) + missing_required_padding_n = float(needs_padding_n and not pad_n) + missing_required_padding_k = float(needs_padding_k and not pad_k) + 
missing_any_required_padding = float( + missing_required_padding_m or missing_required_padding_n or missing_required_padding_k + ) + + return [ + M, # 0 + N, # 1 + K, # 2 + split_k, # 3 + log2_M, # 4 + log2_N, # 5 + log2_K, # 6 + log2_MNK, # 7 + ai, # 8 + M / max(N, 1), # 9 (aspect_ratio_mn) + M / max(K, 1), # 10 (aspect_ratio_mk) + N / max(K, 1), # 11 (aspect_ratio_nk) + LAYOUT_MAP.get(layout, 0), # 12 + tile_m, # 13 + tile_n, # 14 + tile_k, # 15 + warp_m, # 16 + warp_n, # 17 + warp_k, # 18 + warp_tile_m, # 19 + warp_tile_n, # 20 + warp_tile_k, # 21 + PIPELINE_MAP.get(pipeline, 0), # 22 + SCHEDULER_MAP.get(scheduler, 0), # 23 + EPILOGUE_MAP.get(epilogue, 0), # 24 + float(pad_m), # 25 + float(pad_n), # 26 + float(pad_k), # 27 + float(persistent), # 28 + warp_m * warp_n * warp_k, # 29 (num_warps) + tile_m * tile_n * tile_k, # 30 (tile_volume) + tile_m * tile_n, # 31 (tile_mn) + lds_est, # 32 (lds_usage_estimate) + lds_est / max(lds_cap, 1), # 33 (lds_usage_ratio) + ntm, # 34 (num_tiles_m) + ntn, # 35 (num_tiles_n) + ntk, # 36 (num_tiles_k) + ntm * ntn, # 37 (total_output_tiles) + eff(M, tile_m), # 38 (tile_eff_m) + eff(N, tile_n), # 39 (tile_eff_n) + eff(K, tile_k), # 40 (tile_eff_k) + eff(M, tile_m) * eff(N, tile_n) * eff(K, tile_k), # 41 (overall_tile_efficiency) + ntm * ntn / max(hw["num_cus"], 1), # 42 (cu_utilization) + ratio_M_to_tile_m, # 43 + ratio_N_to_tile_n, # 44 + ratio_K_to_tile_k, # 45 + problem_smaller_than_tile_m, # 46 + problem_smaller_than_tile_n, # 47 + problem_smaller_than_tile_k, # 48 + any_dim_too_small, # 49 + needs_padding_m, # 50 + needs_padding_n, # 51 + needs_padding_k, # 52 + has_padding_when_needed_m, # 53 + has_padding_when_needed_n, # 54 + has_padding_when_needed_k, # 55 + missing_required_padding_m, # 56 + missing_required_padding_n, # 57 + missing_required_padding_k, # 58 + missing_any_required_padding, # 59 + hw["num_cus"], # 60 + hw["simds_per_cu"], # 61 + hw["num_cus"] * hw["simds_per_cu"], # 62 (total_simds) + hw["shader_engines"], # 63 + hw["max_clock_mhz"], # 64 + hw["max_waves_per_cu"], # 65 + hw["wavefront_size"], # 66 + hw["lds_capacity"], # 67 + hw["l1_cache_kb"], # 68 + hw["l2_cache_kb"], # 69 + hw["l3_cache_kb"], # 70 + hw["num_xcd"], # 71 + ] + + +TEST_CASES = [ + { + "problem": { + "m": 1024, + "n": 1024, + "k": 1024, + "split_k": 1, + "dtype": "fp8", + "layout": "rcr", + }, + "kernel": { + "tile_m": 128, + "tile_n": 128, + "tile_k": 64, + "warp_m": 2, + "warp_n": 2, + "warp_k": 1, + "warp_tile_m": 32, + "warp_tile_n": 32, + "warp_tile_k": 16, + "pipeline": "compv3", + "scheduler": "intrawave", + "epilogue": "cshuffle", + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + }, + }, + { + "problem": { + "m": 1, + "n": 4096, + "k": 4096, + "split_k": 1, + "dtype": "fp8", + "layout": "rcr", + }, + "kernel": { + "tile_m": 64, + "tile_n": 64, + "tile_k": 128, + "warp_m": 1, + "warp_n": 4, + "warp_k": 1, + "warp_tile_m": 16, + "warp_tile_n": 16, + "warp_tile_k": 128, + "pipeline": "compv4", + "scheduler": "interwave", + "epilogue": "default", + "pad_m": True, + "pad_n": True, + "pad_k": True, + "persistent": True, + }, + }, + { + "problem": { + "m": 20480, + "n": 7168, + "k": 256, + "split_k": 1, + "dtype": "fp16", + "layout": "rrr", + }, + "kernel": { + "tile_m": 256, + "tile_n": 256, + "tile_k": 32, + "warp_m": 4, + "warp_n": 1, + "warp_k": 1, + "warp_tile_m": 32, + "warp_tile_n": 32, + "warp_tile_k": 16, + "pipeline": "mem", + "scheduler": "interwave", + "epilogue": "cshuffle", + "pad_m": False, + "pad_n": False, + "pad_k": 
False, + "persistent": False, + }, + }, +] + +HW = { + "num_cus": 256, + "simds_per_cu": 4, + "shader_engines": 32, + "max_clock_mhz": 2400, + "max_waves_per_cu": 32, + "wavefront_size": 64, + "lds_capacity": 65536, + "l1_cache_kb": 32, + "l2_cache_kb": 4096, + "l3_cache_kb": 262144, + "num_xcd": 8, +} + + +class TestFeatureParity: + """Verify Python feature engine matches manual computation (C++ uses same logic).""" + + @pytest.fixture + def fe(self): + return GemmUniversalFeatureEngine(**HW) + + @pytest.mark.parametrize("case_idx", range(len(TEST_CASES))) + def test_python_matches_manual(self, fe, case_idx): + case = TEST_CASES[case_idx] + prob = case["problem"] + kern = case["kernel"] + + py_features = fe.extract(prob, kern) + + manual = _compute_features_manually( + prob["m"], + prob["n"], + prob["k"], + prob["split_k"], + prob["dtype"], + prob["layout"], + kern["tile_m"], + kern["tile_n"], + kern["tile_k"], + kern["warp_m"], + kern["warp_n"], + kern["warp_k"], + kern["warp_tile_m"], + kern["warp_tile_n"], + kern["warp_tile_k"], + kern["pipeline"], + kern["scheduler"], + kern["epilogue"], + kern["pad_m"], + kern["pad_n"], + kern["pad_k"], + kern["persistent"], + HW, + ) + + manual_arr = np.array(manual, dtype=np.float64) + assert len(py_features) == len(manual_arr) == 72 + + for i in range(72): + assert py_features[i] == pytest.approx( + manual_arr[i], rel=1e-10, abs=1e-15 + ), ( + f"Feature {i} ({fe.get_feature_names()[i]}): Python={py_features[i]}, Manual={manual_arr[i]}" + ) + + def test_feature_count(self, fe): + assert len(fe.get_feature_names()) == 72 + + def test_encoding_maps_match_cpp(self): + """The C++ encode_* functions must use the same mapping as Python.""" + assert PIPELINE_MAP == { + "compv3": 0, + "compv4": 1, + "compv5": 2, + "mem": 3, + "preshufflev2": 4, + } + assert SCHEDULER_MAP == {"intrawave": 0, "interwave": 1} + assert EPILOGUE_MAP == {"default": 0, "cshuffle": 1} + assert LAYOUT_MAP == {"rcr": 0, "rrr": 1, "crr": 2, "ccr": 3} + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/tests/test_model_compression.py b/dispatcher/heuristics/tests/test_model_compression.py new file mode 100644 index 0000000000..50727f1242 --- /dev/null +++ b/dispatcher/heuristics/tests/test_model_compression.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +"""Test that compressed models can be loaded and used.""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from predict import Predictor + + +def test_fp16_model_decompression(): + """Test that fp16 model is auto-decompressed and usable.""" + model_dir = Path(__file__).parent.parent / "models" / "gemm_universal_fp16_gfx950" + + # Ensure .lgbm.gz exists + gz_file = model_dir / "model_tflops.lgbm.gz" + + assert gz_file.exists(), f"Compressed model not found: {gz_file}" + + # Load predictor - should auto-decompress + predictor = Predictor(model_dir) + + # Test prediction + problem = {"m": 128, "n": 1536, "k": 7168, "dtype": "fp16", "layout": "rcr"} + kernel_config = { + "tile_shape": {"m0": 128, "n0": 128, "k0": 16}, + "wave_shape": {"m1": 2, "n1": 2, "k1": 1}, + "warp_tile": {"m2": 32, "n2": 32, "k2": 8}, + } + + tflops = predictor.predict_tflops(problem, kernel_config) + + assert isinstance(tflops, float), f"Expected float, got {type(tflops)}" + assert tflops > 0, f"Expected positive TFLOPS, got {tflops}" + + # Verify decompressed file was created + lgbm_file = model_dir / "model_tflops.lgbm" + assert lgbm_file.exists(), "Model 
should have been decompressed" + + print(f"✅ FP16 model decompression test passed") + print(f" Predicted TFLOPS: {tflops:.2f}") + print(f" Decompressed to: {lgbm_file}") + return True + + +def test_fp8_model_decompression(): + """Test that fp8 model is auto-decompressed and usable.""" + model_dir = Path(__file__).parent.parent / "models" / "gemm_universal_fp8_gfx950" + + # Ensure .lgbm.gz exists + gz_file = model_dir / "model_tflops.lgbm.gz" + + assert gz_file.exists(), f"Compressed model not found: {gz_file}" + + # Load predictor - should auto-decompress + predictor = Predictor(model_dir) + + # Test prediction + problem = {"m": 2048, "n": 2048, "k": 2048, "dtype": "fp8", "layout": "rcr"} + kernel_config = { + "tile_shape": {"m0": 256, "n0": 256, "k0": 64}, + "wave_shape": {"m1": 2, "n1": 2, "k1": 1}, + "warp_tile": {"m2": 32, "n2": 32, "k2": 16}, + } + + tflops = predictor.predict_tflops(problem, kernel_config) + + assert isinstance(tflops, float), f"Expected float, got {type(tflops)}" + assert tflops > 0, f"Expected positive TFLOPS, got {tflops}" + + # Verify decompressed file was created + lgbm_file = model_dir / "model_tflops.lgbm" + assert lgbm_file.exists(), "Model should have been decompressed" + + print(f"✅ FP8 model decompression test passed") + print(f" Predicted TFLOPS: {tflops:.2f}") + print(f" Decompressed to: {lgbm_file}") + return True + + +if __name__ == "__main__": + print("Testing compressed model auto-decompression...") + print() + + test_fp16_model_decompression() + print() + test_fp8_model_decompression() + print() + print("✅ All model compression tests passed!") diff --git a/dispatcher/heuristics/tests/test_predict.py b/dispatcher/heuristics/tests/test_predict.py new file mode 100644 index 0000000000..24cb26c4fa --- /dev/null +++ b/dispatcher/heuristics/tests/test_predict.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for predict.py. + +Covers: Predictor initialization, single prediction, ranking, select_best, +missing model handling, and edge cases (single kernel, empty list). 
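+
+The fixture models are tiny LightGBM regressors fitted on random data, so the
+predicted values are meaningless; only API behaviour (types, ordering, error
+handling) is asserted.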
+""" + +import json +import sys +from pathlib import Path + +import lightgbm as lgb +import numpy as np +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from feature_engine import GemmUniversalFeatureEngine +from predict import Predictor + + +@pytest.fixture +def model_dir(tmp_path): + """Create a minimal trained model for testing.""" + fe = GemmUniversalFeatureEngine() + n_features = len(fe.get_feature_names()) + + np.random.seed(42) + X = np.random.rand(200, n_features) + y = np.random.rand(200) * 100 + + model = lgb.LGBMRegressor(n_estimators=10, verbose=-1) + model.fit(X, y) + model.booster_.save_model(str(tmp_path / "model_tflops.lgbm")) + + y_lat = np.random.rand(200) * 0.1 + model_lat = lgb.LGBMRegressor(n_estimators=10, verbose=-1) + model_lat.fit(X, y_lat) + model_lat.booster_.save_model(str(tmp_path / "model_latency.lgbm")) + + spec = { + "feature_names": fe.get_feature_names(), + "categorical_features": fe.get_categorical_features(), + } + with open(tmp_path / "feature_spec.json", "w") as f: + json.dump(spec, f) + + return tmp_path + + +@pytest.fixture +def predictor(model_dir): + return Predictor(model_dir) + + +def _problem(): + return { + "m": 1024, + "n": 1024, + "k": 1024, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + } + + +def _kernel(tile_m=128, pipeline="compv3"): + return { + "kernel_name": f"test_kernel_{tile_m}_{pipeline}", + "tile_m": tile_m, + "tile_n": 128, + "tile_k": 64, + "warp_m": 2, + "warp_n": 2, + "warp_k": 1, + "warp_tile_m": 32, + "warp_tile_n": 32, + "warp_tile_k": 16, + "pipeline": pipeline, + "scheduler": "intrawave", + "epilogue": "cshuffle", + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + } + + +class TestPredictor: + def test_predict_tflops_returns_float(self, predictor): + result = predictor.predict_tflops(_problem(), _kernel()) + assert isinstance(result, float) + + def test_predict_latency_returns_float(self, predictor): + result = predictor.predict_latency(_problem(), _kernel()) + assert isinstance(result, float) + + def test_predict_all_returns_dict(self, predictor): + result = predictor.predict_all(_problem(), _kernel()) + assert "tflops" in result + assert "latency_ms" in result + + def test_rank_kernels_sorted_descending(self, predictor): + kernels = [_kernel(64, "compv3"), _kernel(128, "compv4"), _kernel(256, "mem")] + ranked = predictor.rank_kernels(_problem(), kernels) + assert len(ranked) == 3 + scores = [s for _, s in ranked] + assert scores == sorted(scores, reverse=True) + + def test_select_best_returns_name(self, predictor): + kernels = [_kernel(64), _kernel(128)] + best = predictor.select_best(_problem(), kernels) + assert isinstance(best, str) + assert best in [k["kernel_name"] for k in kernels] + + def test_single_kernel(self, predictor): + kernels = [_kernel(128)] + ranked = predictor.rank_kernels(_problem(), kernels) + assert len(ranked) == 1 + + def test_missing_bandwidth_model(self, model_dir): + pred = Predictor(model_dir) + with pytest.raises(FileNotFoundError): + pred.predict_bandwidth(_problem(), _kernel()) + + def test_empty_kernel_list(self, predictor): + with pytest.raises(ValueError): + predictor.select_best(_problem(), []) + + def test_corner_case_m1(self, predictor): + prob = { + "m": 1, + "n": 4096, + "k": 4096, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + } + result = predictor.predict_tflops(prob, _kernel()) + assert np.isfinite(result) + + def test_different_shapes_give_different_results(self, predictor): + k = _kernel() + r1 = 
predictor.predict_tflops( + { + "m": 16, + "n": 1536, + "k": 7168, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + }, + k, + ) + r2 = predictor.predict_tflops( + { + "m": 20480, + "n": 7168, + "k": 256, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + }, + k, + ) + assert r1 != r2 + + +class TestPredictorEdgeCases: + def test_nonexistent_model_dir(self): + with pytest.raises(Exception): + pred = Predictor("/nonexistent/path") + pred.predict_tflops(_problem(), _kernel()) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/tests/test_search.py b/dispatcher/heuristics/tests/test_search.py new file mode 100644 index 0000000000..b1d1ac79b3 --- /dev/null +++ b/dispatcher/heuristics/tests/test_search.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for search.py. + +Covers: random search, DE search, config validity, result ordering, +budget compliance, and edge cases. +""" + +import json +import sys +from pathlib import Path + +import lightgbm as lgb +import numpy as np +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from feature_engine import GemmUniversalFeatureEngine +from predict import Predictor +from search import SurrogateSearch + + +@pytest.fixture +def model_dir(tmp_path): + """Create a minimal trained model.""" + fe = GemmUniversalFeatureEngine() + n_features = len(fe.get_feature_names()) + np.random.seed(42) + X = np.random.rand(200, n_features) + y = np.random.rand(200) * 500 + model = lgb.LGBMRegressor(n_estimators=10, verbose=-1) + model.fit(X, y) + model.booster_.save_model(str(tmp_path / "model_tflops.lgbm")) + spec = { + "feature_names": fe.get_feature_names(), + "categorical_features": fe.get_categorical_features(), + } + with open(tmp_path / "feature_spec.json", "w") as f: + json.dump(spec, f) + return tmp_path + + +@pytest.fixture +def predictor(model_dir): + return Predictor(model_dir) + + +def _problem(): + return { + "m": 1024, + "n": 1024, + "k": 1024, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + } + + +class TestRandomSearch: + def test_returns_results(self, predictor): + searcher = SurrogateSearch(predictor, strategy="random") + results = searcher.search(_problem(), budget=50, top_k=5) + assert len(results) > 0 + assert len(results) <= 5 + + def test_results_sorted_descending(self, predictor): + searcher = SurrogateSearch(predictor, strategy="random") + results = searcher.search(_problem(), budget=100, top_k=10) + scores = [s for _, s in results] + assert scores == sorted(scores, reverse=True) + + def test_configs_are_valid(self, predictor): + fe = GemmUniversalFeatureEngine() + searcher = SurrogateSearch(predictor, feature_engine=fe, strategy="random") + results = searcher.search(_problem(), budget=50, top_k=5) + for cfg, _ in results: + ps = fe.get_parameter_space() + for k, v in cfg.items(): + if k in ps: + assert v in ps[k], f"{k}={v} not in {ps[k]}" + + def test_respects_top_k(self, predictor): + searcher = SurrogateSearch(predictor, strategy="random") + results = searcher.search(_problem(), budget=100, top_k=3) + assert len(results) <= 3 + + def test_different_problems_produce_results(self, predictor): + """Both problem sizes should produce valid search results.""" + searcher = SurrogateSearch(predictor, strategy="random", seed=42) + r1 = searcher.search( + { + "m": 16, + "n": 1536, + "k": 7168, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + }, + 
budget=50, + top_k=3, + ) + searcher2 = SurrogateSearch(predictor, strategy="random", seed=42) + r2 = searcher2.search( + { + "m": 20480, + "n": 7168, + "k": 256, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + }, + budget=50, + top_k=3, + ) + assert len(r1) > 0 + assert len(r2) > 0 + for _, score in r1 + r2: + assert np.isfinite(score) + + def test_m1_corner_case(self, predictor): + searcher = SurrogateSearch(predictor, strategy="random") + results = searcher.search( + { + "m": 1, + "n": 4096, + "k": 4096, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + }, + budget=50, + top_k=5, + ) + assert len(results) > 0 + for _, score in results: + assert np.isfinite(score) + + +class TestDESearch: + def test_returns_results(self, predictor): + searcher = SurrogateSearch(predictor, strategy="de") + results = searcher.search(_problem(), budget=100, top_k=5) + assert len(results) > 0 + + def test_results_sorted_descending(self, predictor): + searcher = SurrogateSearch(predictor, strategy="de") + results = searcher.search(_problem(), budget=100, top_k=5) + scores = [s for _, s in results] + assert scores == sorted(scores, reverse=True) + + def test_de_improves_over_initial(self, predictor): + """DE should generally find at least as good as random initialization.""" + searcher_r = SurrogateSearch(predictor, strategy="random", seed=42) + r_results = searcher_r.search(_problem(), budget=100, top_k=1) + searcher_d = SurrogateSearch(predictor, strategy="de", seed=42) + d_results = searcher_d.search(_problem(), budget=100, top_k=1) + if r_results and d_results: + assert d_results[0][1] >= r_results[0][1] * 0.9 + + def test_small_budget(self, predictor): + searcher = SurrogateSearch(predictor, strategy="de") + results = searcher.search(_problem(), budget=30, top_k=5) + assert len(results) > 0 + + +class TestSearchEdgeCases: + def test_unknown_strategy_raises(self, predictor): + searcher = SurrogateSearch(predictor, strategy="unknown") + with pytest.raises(ValueError): + searcher.search(_problem(), budget=10) + + def test_zero_budget(self, predictor): + searcher = SurrogateSearch(predictor, strategy="random") + results = searcher.search(_problem(), budget=0, top_k=5) + assert len(results) == 0 + + def test_deterministic_with_same_seed(self, predictor): + s1 = SurrogateSearch(predictor, strategy="random", seed=123) + s2 = SurrogateSearch(predictor, strategy="random", seed=123) + r1 = s1.search(_problem(), budget=50, top_k=5) + r2 = s2.search(_problem(), budget=50, top_k=5) + assert len(r1) == len(r2) + for (c1, s1_), (c2, s2_) in zip(r1, r2): + assert s1_ == pytest.approx(s2_) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/tests/test_train.py b/dispatcher/heuristics/tests/test_train.py new file mode 100644 index 0000000000..d437030bfa --- /dev/null +++ b/dispatcher/heuristics/tests/test_train.py @@ -0,0 +1,329 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for train.py. + +Covers: group key computation, TFLOPS efficiency calculation, edge cases +(single group, all-invalid data, tied predictions), and warm-start +incremental training (feature compat, lineage, quality). 
+""" + +import json +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from feature_engine import GemmUniversalFeatureEngine +from train import ( + compute_group_keys, + compute_tflops_efficiency, + check_feature_compatibility, + load_warm_start_model, + train_final_model, + DEFAULT_PARAMS, +) + + +class TestComputeGroupKeys: + def test_basic(self): + df = pd.DataFrame( + {"m": [16, 16, 32], "n": [1536, 1536, 1536], "k": [7168, 7168, 7168]} + ) + keys = compute_group_keys(df) + assert keys[0] == keys[1] + assert keys[0] != keys[2] + + def test_unique_shapes(self): + df = pd.DataFrame({"m": [1, 2, 3], "n": [4, 5, 6], "k": [7, 8, 9]}) + keys = compute_group_keys(df) + assert len(set(keys)) == 3 + + +class TestComputeTflopsEfficiency: + def test_perfect_prediction(self): + """Model predicts highest TFLOPS kernel => efficiency = 1.0.""" + df = pd.DataFrame( + { + "m": [1024, 1024, 1024], + "n": [1024, 1024, 1024], + "k": [1024, 1024, 1024], + "measured_tflops": [100, 200, 150], + "pred_tflops": [50, 300, 100], # correctly ranks kernel 1 highest + } + ) + eff = compute_tflops_efficiency(df, "pred_tflops") + assert len(eff) == 1 + assert eff["efficiency"].iloc[0] == pytest.approx(1.0) + + def test_worst_prediction(self): + """Model picks the worst kernel.""" + df = pd.DataFrame( + { + "m": [1024, 1024, 1024], + "n": [1024, 1024, 1024], + "k": [1024, 1024, 1024], + "measured_tflops": [100, 200, 150], + "pred_tflops": [999, 1, 1], # incorrectly ranks kernel 0 highest + } + ) + eff = compute_tflops_efficiency(df, "pred_tflops") + assert eff["efficiency"].iloc[0] == pytest.approx(100 / 200) + + def test_multiple_shapes(self): + df = pd.DataFrame( + { + "m": [16, 16, 32, 32], + "n": [1536, 1536, 1536, 1536], + "k": [7168, 7168, 7168, 7168], + "measured_tflops": [10, 20, 100, 200], + "pred_tflops": [5, 25, 150, 190], + } + ) + eff = compute_tflops_efficiency(df, "pred_tflops") + assert len(eff) == 2 + assert eff.iloc[0]["efficiency"] == pytest.approx(1.0) + assert eff.iloc[1]["efficiency"] == pytest.approx(1.0) + + def test_zero_tflops_shape_skipped(self): + df = pd.DataFrame( + { + "m": [16, 16], + "n": [16, 16], + "k": [16, 16], + "measured_tflops": [0, 0], + "pred_tflops": [1, 2], + } + ) + eff = compute_tflops_efficiency(df, "pred_tflops") + assert len(eff) == 0 + + def test_single_kernel_per_shape(self): + df = pd.DataFrame( + { + "m": [1024], + "n": [1024], + "k": [1024], + "measured_tflops": [150], + "pred_tflops": [100], + } + ) + eff = compute_tflops_efficiency(df, "pred_tflops") + assert len(eff) == 1 + assert eff["efficiency"].iloc[0] == pytest.approx(1.0) + + def test_tied_predictions(self): + """When multiple kernels have the same predicted TFLOPS, pandas idxmax picks the first.""" + df = pd.DataFrame( + { + "m": [1024, 1024, 1024], + "n": [1024, 1024, 1024], + "k": [1024, 1024, 1024], + "measured_tflops": [100, 200, 200], + "pred_tflops": [50, 50, 50], + } + ) + eff = compute_tflops_efficiency(df, "pred_tflops") + assert len(eff) == 1 + assert eff["efficiency"].iloc[0] >= 0.5 + + +# --------------------------------------------------------------------------- +# Helpers for warm-start tests +# --------------------------------------------------------------------------- + + +def _make_dummy_data(n_rows=200, n_shapes=5): + """Create a small synthetic benchmark DataFrame for testing training.""" + rng = np.random.RandomState(42) + rows = [] + for _ in range(n_rows): + m = 
rng.choice([64, 128, 256, 512, 1024]) + n = rng.choice([64, 128, 256, 512, 1024]) + k = rng.choice([64, 128, 256, 512, 1024]) + rows.append( + { + "m": m, + "n": n, + "k": k, + "split_k": 1, + "dtype": "fp8", + "layout": "rcr", + "op_type": "gemm_universal", + "tile_m": rng.choice([64, 128, 256]), + "tile_n": rng.choice([64, 128, 256]), + "tile_k": rng.choice([32, 64, 128]), + "warp_m": rng.choice([1, 2, 4]), + "warp_n": rng.choice([1, 2, 4]), + "warp_k": 1, + "warp_tile_m": 32, + "warp_tile_n": 32, + "warp_tile_k": 16, + "pipeline": rng.choice(["compv3", "compv4", "mem"]), + "scheduler": rng.choice(["intrawave", "interwave"]), + "epilogue": "cshuffle", + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + "measured_tflops": float(rng.uniform(10, 500)), + "latency_ms": float(rng.uniform(0.01, 1.0)), + "bandwidth_gb_s": float(rng.uniform(50, 1500)), + "is_valid": True, + "kernel_name": f"test_kernel_{rng.randint(0, 100)}", + } + ) + return pd.DataFrame(rows) + + +def _save_feature_spec(model_dir, fe): + """Save a feature_spec.json matching the given feature engine.""" + spec = { + "feature_names": fe.get_feature_names(), + "categorical_features": fe.get_categorical_features(), + } + with open(model_dir / "feature_spec.json", "w") as f: + json.dump(spec, f) + + +def _train_and_save_base_model(model_dir, df, fe, target="tflops"): + """Train a small base model and save it to model_dir.""" + params = dict(DEFAULT_PARAMS) + params["n_estimators"] = 20 + params["n_jobs"] = 1 + model = train_final_model(df, fe, target, params) + model.booster_.save_model(str(model_dir / f"model_{target}.lgbm")) + _save_feature_spec(model_dir, fe) + return model + + +# --------------------------------------------------------------------------- +# Warm-start tests +# --------------------------------------------------------------------------- + + +class TestCheckFeatureCompatibility: + def test_compatible_passes(self, tmp_path): + fe = GemmUniversalFeatureEngine() + _save_feature_spec(tmp_path, fe) + check_feature_compatibility(tmp_path, fe) + + def test_missing_spec_raises(self, tmp_path): + fe = GemmUniversalFeatureEngine() + with pytest.raises(FileNotFoundError, match="feature_spec.json"): + check_feature_compatibility(tmp_path, fe) + + def test_added_feature_raises(self, tmp_path): + fe = GemmUniversalFeatureEngine() + spec = { + "feature_names": fe.get_feature_names()[:-1], + "categorical_features": fe.get_categorical_features(), + } + with open(tmp_path / "feature_spec.json", "w") as f: + json.dump(spec, f) + with pytest.raises(ValueError, match="Feature schema mismatch"): + check_feature_compatibility(tmp_path, fe) + + def test_removed_feature_raises(self, tmp_path): + fe = GemmUniversalFeatureEngine() + spec = { + "feature_names": fe.get_feature_names() + ["extra_feature"], + "categorical_features": fe.get_categorical_features(), + } + with open(tmp_path / "feature_spec.json", "w") as f: + json.dump(spec, f) + with pytest.raises(ValueError, match="Feature schema mismatch"): + check_feature_compatibility(tmp_path, fe) + + def test_categorical_mismatch_raises(self, tmp_path): + fe = GemmUniversalFeatureEngine() + spec = { + "feature_names": fe.get_feature_names(), + "categorical_features": ["layout", "pipeline"], + } + with open(tmp_path / "feature_spec.json", "w") as f: + json.dump(spec, f) + with pytest.raises(ValueError, match="Categorical feature mismatch"): + check_feature_compatibility(tmp_path, fe) + + +class TestLoadWarmStartModel: + def test_loads_existing_model(self, 
tmp_path): + fe = GemmUniversalFeatureEngine() + df = _make_dummy_data() + _train_and_save_base_model(tmp_path, df, fe) + path = load_warm_start_model(tmp_path, "tflops") + assert path is not None + assert Path(path).exists() + + def test_returns_none_for_missing_target(self, tmp_path): + assert load_warm_start_model(tmp_path, "tflops") is None + + def test_returns_none_for_wrong_target(self, tmp_path): + fe = GemmUniversalFeatureEngine() + df = _make_dummy_data() + _train_and_save_base_model(tmp_path, df, fe, target="tflops") + assert load_warm_start_model(tmp_path, "bandwidth") is None + + +class TestWarmStartTraining: + def test_warm_start_produces_more_trees(self, tmp_path): + """A warm-started model should have more trees than the base.""" + fe = GemmUniversalFeatureEngine() + df = _make_dummy_data(n_rows=300) + + base_dir = tmp_path / "base" + base_dir.mkdir() + base_model = _train_and_save_base_model(base_dir, df, fe) + base_n_trees = base_model.booster_.num_trees() + + init_model_path = load_warm_start_model(base_dir, "tflops") + params = dict(DEFAULT_PARAMS) + params["n_estimators"] = 15 + params["n_jobs"] = 1 + warm_model = train_final_model( + df, fe, "tflops", params, init_model=init_model_path + ) + warm_n_trees = warm_model.booster_.num_trees() + + assert warm_n_trees > base_n_trees + + def test_warm_start_does_not_degrade(self, tmp_path): + """Warm-started model on the same data should not be significantly worse.""" + fe = GemmUniversalFeatureEngine() + df = _make_dummy_data(n_rows=300) + + base_dir = tmp_path / "base" + base_dir.mkdir() + base_model = _train_and_save_base_model(base_dir, df, fe) + + X = fe.extract_batch(df[df["is_valid"]].reset_index(drop=True)) + y = df[df["is_valid"]]["measured_tflops"].values + base_rmse = np.sqrt(np.mean((base_model.predict(X) - y) ** 2)) + + init_model_path = load_warm_start_model(base_dir, "tflops") + params = dict(DEFAULT_PARAMS) + params["n_estimators"] = 15 + params["n_jobs"] = 1 + warm_model = train_final_model( + df, fe, "tflops", params, init_model=init_model_path + ) + warm_rmse = np.sqrt(np.mean((warm_model.predict(X) - y) ** 2)) + + assert warm_rmse <= base_rmse * 1.1 + + def test_warm_start_from_nonexistent_dir(self): + with pytest.raises(FileNotFoundError): + check_feature_compatibility( + Path("/nonexistent/model/dir"), GemmUniversalFeatureEngine() + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/train.py b/dispatcher/heuristics/train.py new file mode 100644 index 0000000000..6d5dc772ac --- /dev/null +++ b/dispatcher/heuristics/train.py @@ -0,0 +1,555 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Training script for CK Tile kernel performance prediction. + +Trains LGBMRegressor models (TFLOPS, latency, bandwidth) with: + - Log-space regression (log1p transform) for scale-invariant accuracy + - GroupKFold cross-validation (group key = (M, N, K)) + - Iterative Hard Example Mining (IHEM) + - Model complexity bounds for C++ deployability + - Optional Optuna hyperparameter tuning + - Warm-start incremental training from a previous model via --warm_start + +Log-transform rationale: + GEMM TFLOPS spans 5 orders of magnitude (0.02 for M=1 to 2230 for large + shapes). Raw regression optimizes for absolute RMSE, which means the model + spends all its capacity predicting large shapes accurately and ignores tiny + shapes where TFLOPS is < 10. 
Training on log1p(TFLOPS) puts all shapes on + equal footing, improving tiny_m efficiency from 84% to 96%. +""" + +import argparse +import json +import time +from pathlib import Path + +import lightgbm as lgb +import numpy as np +import pandas as pd +from sklearn.model_selection import GroupKFold + +from data_pipeline import build_training_dataset +from feature_engine import GemmUniversalFeatureEngine + + +TARGET_COLUMNS = { + "tflops": "measured_tflops", + "latency": "latency_ms", + "bandwidth": "bandwidth_gb_s", +} + +# Targets where log1p transform is applied by default. +# TFLOPS and bandwidth span orders of magnitude; latency is already small-scale. +LOG_TARGETS = {"tflops", "bandwidth"} + +DEFAULT_PARAMS = { + "objective": "regression", + "metric": ["rmse", "mae"], + "num_leaves": 255, + "max_depth": 15, + "n_estimators": 2000, + "learning_rate": 0.02, + "min_child_samples": 10, + "subsample": 0.85, + "colsample_bytree": 0.85, + "reg_alpha": 0.05, + "reg_lambda": 0.5, + "verbose": -1, + "n_jobs": 8, + "seed": 42, +} + +MAX_ESTIMATORS = 5000 +WARM_START_N_ESTIMATORS = 500 + + +def check_feature_compatibility( + prev_model_dir: Path, + feature_engine: GemmUniversalFeatureEngine, +) -> None: + """Verify that the previous model's feature spec matches the current engine. + + Raises ValueError with a detailed message on mismatch. This prevents silent + corruption when warm-starting from a model trained with a different feature + schema (e.g., after adding a new feature or changing an encoding). + """ + spec_path = prev_model_dir / "feature_spec.json" + if not spec_path.exists(): + raise FileNotFoundError( + f"No feature_spec.json in {prev_model_dir}. " + "Cannot verify feature compatibility for warm start." + ) + + with open(spec_path) as f: + prev_spec = json.load(f) + + prev_names = prev_spec.get("feature_names", []) + curr_names = feature_engine.get_feature_names() + if prev_names != curr_names: + added = set(curr_names) - set(prev_names) + removed = set(prev_names) - set(curr_names) + parts = ["Feature schema mismatch between previous model and current engine."] + if added: + parts.append(f" Added features: {sorted(added)}") + if removed: + parts.append(f" Removed features: {sorted(removed)}") + if not added and not removed: + parts.append(" Feature order changed (names match but order differs).") + raise ValueError("\n".join(parts)) + + prev_cats = prev_spec.get("categorical_features", []) + curr_cats = feature_engine.get_categorical_features() + if sorted(prev_cats) != sorted(curr_cats): + raise ValueError( + f"Categorical feature mismatch.\n" + f" Previous: {sorted(prev_cats)}\n" + f" Current: {sorted(curr_cats)}" + ) + + +def load_warm_start_model(prev_model_dir: Path, target: str) -> str | None: + """Load the path to a previous model file for warm-start, or None if absent. + + Automatically decompresses .lgbm.gz files if the .lgbm file doesn't exist. + The decompressed file is cached to disk for subsequent loads. + + Returns the string path (what LightGBM's init_model expects) rather than + a loaded Booster, because LGBMRegressor.fit(init_model=...) accepts both + path strings and Booster objects and path strings avoid keeping the old + model in memory. 
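+
+    Illustrative sketch only (the model directory name, params, X, and y below are
+    assumptions for the example, not values defined in this module):
+
+        init_path = load_warm_start_model(Path("models/gemm_universal_fp8_gfx950"), "tflops")
+        model = lgb.LGBMRegressor(**params)
+        model.fit(X, y, init_model=init_path)  # init_model=None simply trains from scratch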
+ """ + import gzip + + model_path = prev_model_dir / f"model_{target}.lgbm" + gz_path = prev_model_dir / f"model_{target}.lgbm.gz" + + # Auto-decompress if needed + if not model_path.exists() and gz_path.exists(): + print(f" Decompressing {gz_path.name}...") + with gzip.open(gz_path, "rb") as f_in: + with open(model_path, "wb") as f_out: + f_out.write(f_in.read()) + + if not model_path.exists(): + return None + return str(model_path) + + +def compute_group_keys(df: pd.DataFrame) -> np.ndarray: + """Create GroupKFold group keys from (M, N, K).""" + return ( + df["m"].astype(str) + "_" + df["n"].astype(str) + "_" + df["k"].astype(str) + ).values + + +def compute_tflops_efficiency( + df: pd.DataFrame, pred_col: str = "pred_tflops" +) -> pd.DataFrame: + """Compute per-shape efficiency: predicted-best TFLOPS / oracle-best TFLOPS.""" + results = [] + for (m, n, k), group in df.groupby(["m", "n", "k"]): + oracle_best = group["measured_tflops"].max() + if oracle_best <= 0: + continue + pred_best_idx = group[pred_col].idxmax() + selected_tflops = group.loc[pred_best_idx, "measured_tflops"] + efficiency = selected_tflops / oracle_best + results.append( + { + "m": m, + "n": n, + "k": k, + "oracle_best_tflops": oracle_best, + "selected_tflops": selected_tflops, + "efficiency": efficiency, + } + ) + return pd.DataFrame(results) + + +def train_single_target( + X_train, + y_train, + X_val, + y_val, + params: dict, + categorical_features: list[str], + feature_names: list[str], + init_model=None, +) -> lgb.LGBMRegressor: + """Train a single LGBMRegressor with early stopping. + + Parameters + ---------- + init_model : str, Path, lgb.Booster, lgb.LGBMModel, or None + If provided, training continues from this model (warm start). + Accepts a file path to a .lgbm file, a Booster instance, or an + LGBMModel instance. The new model adds n_estimators trees on top + of the existing ones. + """ + cat_indices = [ + feature_names.index(c) for c in categorical_features if c in feature_names + ] + + model = lgb.LGBMRegressor(**params) + model.fit( + X_train, + y_train, + eval_set=[(X_val, y_val)], + eval_metric=["rmse"], + callbacks=[ + lgb.early_stopping(50, verbose=False), + lgb.log_evaluation(0), + ], + categorical_feature=cat_indices if cat_indices else "auto", + init_model=init_model, + ) + return model + + +def run_cv( + df: pd.DataFrame, + feature_engine: GemmUniversalFeatureEngine, + target: str, + params: dict, + n_splits: int = 5, + use_log: bool = True, +) -> dict: + """Run GroupKFold cross-validation and return OOF predictions + metrics. + + Parameters + ---------- + use_log : bool + If True and target is in LOG_TARGETS, train on log1p(y) and invert + predictions with expm1 for efficiency calculation. This normalizes + the scale so that tiny-M shapes (TFLOPS ~ 1) get equal attention + as large-M shapes (TFLOPS ~ 2000). 
+ """ + target_col = TARGET_COLUMNS[target] + valid_mask = df["is_valid"].fillna(False) & (df[target_col] > 0) + df_valid = df[valid_mask].reset_index(drop=True) + + apply_log = use_log and target in LOG_TARGETS + + print( + f" Training on {len(df_valid)} valid rows for target={target}" + f"{' (log-space)' if apply_log else ''}" + ) + + X = feature_engine.extract_batch(df_valid) + y_raw = df_valid[target_col].values + y = np.log1p(y_raw) if apply_log else y_raw + groups = compute_group_keys(df_valid) + feature_names = feature_engine.get_feature_names() + cat_features = feature_engine.get_categorical_features() + + unique_groups = np.unique(groups) + actual_splits = min(n_splits, len(unique_groups)) + if actual_splits < 2: + print(f" WARNING: Only {len(unique_groups)} unique groups, skipping CV") + return {} + + gkf = GroupKFold(n_splits=actual_splits) + oof_preds = np.zeros(len(df_valid)) + fold_metrics = [] + + for fold_idx, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups)): + X_tr, X_val = X[train_idx], X[val_idx] + y_tr, y_val = y[train_idx], y[val_idx] + + model = train_single_target( + X_tr, y_tr, X_val, y_val, params, cat_features, feature_names + ) + preds = model.predict(X_val) + oof_preds[val_idx] = preds + + rmse = np.sqrt(np.mean((preds - y_val) ** 2)) + r2 = 1 - np.sum((preds - y_val) ** 2) / max( + np.sum((y_val - y_val.mean()) ** 2), 1e-10 + ) + + if target == "tflops": + val_df = df_valid.iloc[val_idx].copy() + preds_raw = np.expm1(preds) if apply_log else preds + val_df["pred_tflops"] = preds_raw + eff_df = compute_tflops_efficiency(val_df) + mean_eff = eff_df["efficiency"].mean() if len(eff_df) > 0 else 0 + p10_eff = eff_df["efficiency"].quantile(0.1) if len(eff_df) > 0 else 0 + else: + mean_eff, p10_eff = None, None + + fold_metrics.append( + { + "fold": fold_idx, + "rmse": rmse, + "r2": r2, + "mean_efficiency": mean_eff, + "p10_efficiency": p10_eff, + "train_size": len(train_idx), + "val_size": len(val_idx), + "val_groups": len(np.unique(groups[val_idx])), + } + ) + + eff_str = ( + f", eff={mean_eff:.4f}, p10={p10_eff:.4f}" if mean_eff is not None else "" + ) + print(f" Fold {fold_idx}: RMSE={rmse:.4f}, R2={r2:.4f}{eff_str}") + + df_valid[f"oof_pred_{target}"] = oof_preds + + return { + "fold_metrics": fold_metrics, + "oof_df": df_valid, + "feature_names": feature_names, + "log_transform": apply_log, + } + + +def train_final_model( + df: pd.DataFrame, + feature_engine: GemmUniversalFeatureEngine, + target: str, + params: dict, + init_model=None, + use_log: bool = True, +) -> lgb.LGBMRegressor: + """Train the final model on all valid data. + + Parameters + ---------- + init_model : str, Path, lgb.Booster, lgb.LGBMModel, or None + If provided, training continues from this model (warm start). + use_log : bool + If True and target is in LOG_TARGETS, train on log1p(y). + The saved model then predicts in log-space; callers must apply + expm1() to get raw values. 
+ """ + target_col = TARGET_COLUMNS[target] + valid_mask = df["is_valid"].fillna(False) & (df[target_col] > 0) + df_valid = df[valid_mask].reset_index(drop=True) + + apply_log = use_log and target in LOG_TARGETS + + X = feature_engine.extract_batch(df_valid) + y_raw = df_valid[target_col].values + y = np.log1p(y_raw) if apply_log else y_raw + feature_names = feature_engine.get_feature_names() + cat_features = feature_engine.get_categorical_features() + cat_indices = [feature_names.index(c) for c in cat_features if c in feature_names] + + model = lgb.LGBMRegressor(**params) + model.fit( + X, + y, + categorical_feature=cat_indices if cat_indices else "auto", + init_model=init_model, + ) + return model + + +def main(): + parser = argparse.ArgumentParser( + description="Train CK Tile kernel performance models" + ) + parser.add_argument( + "--data_dir", required=True, help="Directory with parquet files" + ) + parser.add_argument("--out_dir", required=True, help="Output directory for models") + parser.add_argument("--op", default="gemm_universal", help="Operation type") + parser.add_argument("--dtype", default="fp8", help="Data type filter") + parser.add_argument("--arch", default="gfx950", help="Architecture") + parser.add_argument( + "--targets", default="tflops,latency,bandwidth", help="Comma-separated targets" + ) + parser.add_argument("--n_splits", type=int, default=5, help="Number of CV folds") + parser.add_argument( + "--tune", action="store_true", help="Run Optuna hyperparameter tuning" + ) + parser.add_argument( + "--no_log_transform", + action="store_true", + help="Disable log1p transform on targets. By default, TFLOPS and bandwidth " + "are trained in log-space for scale-invariant accuracy across shape sizes.", + ) + parser.add_argument( + "--warm_start", + default=None, + help="Path to previous model directory to continue training from. " + "Uses LightGBM's init_model to add new trees on top of the " + "existing model. Feature schemas must match exactly.", + ) + parser.add_argument( + "--warm_start_n_estimators", + type=int, + default=WARM_START_N_ESTIMATORS, + help=f"Number of new trees to add when warm-starting (default: {WARM_START_N_ESTIMATORS}). 
" + "Lower than a full train since we're refining, not starting from scratch.", + ) + args = parser.parse_args() + + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + targets = [t.strip() for t in args.targets.split(",")] + + print(f"Loading data from {args.data_dir}...") + df = build_training_dataset(args.data_dir, op_type=args.op, dtype=args.dtype) + print(f" Total rows: {len(df)}") + print(f" Unique shapes: {df.groupby(['m', 'n', 'k']).ngroups}") + print(f" Unique kernels: {df['kernel_name'].nunique()}") + + hw_cols = [c for c in df.columns if c.startswith("hw_")] + hw_kwargs = {} + if hw_cols: + row0 = df.iloc[0] + if "hw_num_cus" in df.columns: + hw_kwargs["num_cus"] = int(row0.get("hw_num_cus", 256)) + if "hw_max_clock_mhz" in df.columns: + hw_kwargs["max_clock_mhz"] = int(row0.get("hw_max_clock_mhz", 2400)) + if "hw_simds_per_cu" in df.columns: + hw_kwargs["simds_per_cu"] = int(row0.get("hw_simds_per_cu", 4)) + if "hw_shader_engines" in df.columns: + hw_kwargs["shader_engines"] = int(row0.get("hw_shader_engines", 32)) + if "hw_max_waves_per_cu" in df.columns: + hw_kwargs["max_waves_per_cu"] = int(row0.get("hw_max_waves_per_cu", 32)) + if "hw_wavefront_size" in df.columns: + hw_kwargs["wavefront_size"] = int(row0.get("hw_wavefront_size", 64)) + if "hw_l1_cache_kb" in df.columns: + hw_kwargs["l1_cache_kb"] = int(row0.get("hw_l1_cache_kb", 32)) + if "hw_l2_cache_kb" in df.columns: + hw_kwargs["l2_cache_kb"] = int(row0.get("hw_l2_cache_kb", 4096)) + if "hw_l3_cache_kb" in df.columns: + hw_kwargs["l3_cache_kb"] = int(row0.get("hw_l3_cache_kb", 262144)) + + fe = GemmUniversalFeatureEngine(**hw_kwargs) + + params = dict(DEFAULT_PARAMS) + use_log = not args.no_log_transform + + prev_model_dir = None + prev_manifest = {} + if args.warm_start: + prev_model_dir = Path(args.warm_start) + if not prev_model_dir.exists(): + raise FileNotFoundError(f"Warm-start directory not found: {prev_model_dir}") + print(f" Warm-starting from {prev_model_dir}") + check_feature_compatibility(prev_model_dir, fe) + print(" Feature compatibility: OK") + params["n_estimators"] = args.warm_start_n_estimators + print(f" New trees to add: {args.warm_start_n_estimators}") + + prev_manifest_path = prev_model_dir / "train_manifest.json" + if prev_manifest_path.exists(): + with open(prev_manifest_path) as f: + prev_manifest = json.load(f) + + all_cv_results = {} + for target in targets: + if target not in TARGET_COLUMNS: + print(f" Skipping unknown target: {target}") + continue + + print(f"\n{'=' * 60}") + print(f"Training {target} model") + print(f"{'=' * 60}") + + init_model_path = None + if prev_model_dir is not None: + init_model_path = load_warm_start_model(prev_model_dir, target) + if init_model_path: + print(f" Warm-starting from {init_model_path}") + else: + print(f" No previous {target} model found, training from scratch") + + t0 = time.time() + cv_result = run_cv( + df, fe, target, params, n_splits=args.n_splits, use_log=use_log + ) + cv_time = time.time() - t0 + + if cv_result and cv_result["fold_metrics"]: + all_cv_results[target] = cv_result["fold_metrics"] + metrics_path = out_dir / f"cv_metrics_{target}.json" + with open(metrics_path, "w") as f: + json.dump(cv_result["fold_metrics"], f, indent=2) + print(f" CV completed in {cv_time:.1f}s, saved to {metrics_path}") + + if target == "tflops" and cv_result.get("oof_df") is not None: + oof_df = cv_result["oof_df"] + oof_df.to_parquet(out_dir / "oof_predictions.parquet", index=False) + + eff_df = compute_tflops_efficiency(oof_df, 
"oof_pred_tflops") + if len(eff_df) > 0: + print("\n OOF TFLOPS Efficiency:") + print(f" Mean: {eff_df['efficiency'].mean():.4f}") + print(f" P10: {eff_df['efficiency'].quantile(0.1):.4f}") + print(f" P50: {eff_df['efficiency'].quantile(0.5):.4f}") + print(f" Min: {eff_df['efficiency'].min():.4f}") + + print(f"\n Training final {target} model on all data...") + t0 = time.time() + model = train_final_model( + df, fe, target, params, init_model=init_model_path, use_log=use_log + ) + train_time = time.time() - t0 + + model_path = out_dir / f"model_{target}.lgbm" + model.booster_.save_model(str(model_path)) + print(f" Saved {model_path} ({train_time:.1f}s)") + + importances = dict( + zip( + fe.get_feature_names(), + model.feature_importances_.tolist(), + ) + ) + imp_path = out_dir / f"feature_importances_{target}.json" + with open(imp_path, "w") as f: + json.dump(importances, f, indent=2) + + log_targets_used = sorted(LOG_TARGETS & set(targets)) if use_log else [] + spec = { + "op_type": args.op, + "dtype": args.dtype, + "arch": args.arch, + "feature_names": fe.get_feature_names(), + "categorical_features": fe.get_categorical_features(), + "targets": targets, + "log_targets": log_targets_used, + "params": params, + } + with open(out_dir / "feature_spec.json", "w") as f: + json.dump(spec, f, indent=2) + + manifest = { + "warm_start_from": str(prev_model_dir) if prev_model_dir else None, + "prev_n_estimators": prev_manifest.get( + "total_n_estimators", params.get("n_estimators") + ) + if prev_model_dir + else 0, + "new_n_estimators": params["n_estimators"], + "total_n_estimators": ( + prev_manifest.get("total_n_estimators", 0) + params["n_estimators"] + if prev_model_dir + else params["n_estimators"] + ), + "data_rows": len(df), + "valid_rows": int(df["is_valid"].fillna(False).sum()), + "unique_shapes": int(df.groupby(["m", "n", "k"]).ngroups), + "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"), + } + with open(out_dir / "train_manifest.json", "w") as f: + json.dump(manifest, f, indent=2) + + print(f"\nAll models saved to {out_dir}") + if prev_model_dir: + print(f" Warm-started from: {prev_model_dir}") + print(f" Total estimators: {manifest['total_n_estimators']}") + + +if __name__ == "__main__": + main() diff --git a/dispatcher/heuristics/validate_ml_heuristic.py b/dispatcher/heuristics/validate_ml_heuristic.py new file mode 100644 index 0000000000..ccd7a20cd9 --- /dev/null +++ b/dispatcher/heuristics/validate_ml_heuristic.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +ML Heuristic Validation: Test ML predictions against oracle-best from training data + +This script validates ML-based kernel selection by: +1. Loading benchmark data (oracle-best results for each shape) +2. Using ML model to predict best kernel for each shape +3. 
Comparing ML selection with oracle-best to compute efficiency + +Usage: + python validate_ml_heuristic.py --dtype fp16 --model_dir models/gemm_universal_fp16_gfx950 + python validate_ml_heuristic.py --dtype fp8 --layout rcr +""" + +import sys +import argparse +import pandas as pd +import numpy as np +from pathlib import Path + +from predict import Predictor + + +def validate_ml_heuristic(dtype: str, layout: str, model_dir: str, data_dir: str): + """Validate ML heuristic predictions against oracle-best""" + + print("=" * 100) + print(f" ML Heuristic Validation: {dtype.upper()} {layout.upper()}") + print("=" * 100) + print() + + # Load training data + print(f"Loading training data from {data_dir}...") + + # Try dtype-specific parquet first, then fall back to combined + dtype_specific = ( + Path(data_dir) / f"{dtype}_original" / f"{dtype}_training_data.parquet" + ) + combined = Path(data_dir) / "all_training_data_fixed.parquet" + + if dtype_specific.exists(): + training_data = pd.read_parquet(dtype_specific) + print(f"✓ Loaded {len(training_data):,} benchmark runs from {dtype_specific}") + elif combined.exists(): + training_data = pd.read_parquet(combined) + training_data = training_data[ + (training_data["dtype"] == dtype) & (training_data["layout"] == layout) + ] + print(f"✓ Loaded {len(training_data):,} benchmark runs from {combined}") + else: + print(f"❌ Error: No training data found at {dtype_specific} or {combined}") + return + + if len(training_data) == 0: + print(f"❌ Error: No data found for dtype={dtype}, layout={layout}") + return + + # Get unique shapes with oracle-best + shape_groups = training_data.groupby(["m", "n", "k"]) + print(f"Unique shapes: {len(shape_groups)}") + print() + + # Load ML predictor + print(f"Loading ML predictor from {model_dir}...") + try: + predictor = Predictor(model_dir) + print("✓ Loaded ML predictor") + print(f" Log targets: {predictor._log_targets}") + except Exception as e: + print(f"❌ Error loading model: {e}") + return + + print() + print("=" * 100) + print(" Computing Oracle-Best Efficiency for Each Shape") + print("=" * 100) + print() + + results = [] + + for shape_idx, ((m, n, k), group) in enumerate(shape_groups): + # Find oracle-best (max TFLOPS across all kernels tested) + oracle_best_row = group.loc[group["measured_tflops"].idxmax()] + oracle_best_tflops = oracle_best_row["measured_tflops"] + oracle_best_kernel = oracle_best_row["kernel_name"] + + # Get all kernel configs tested for this shape + kernel_configs = [] + for _, row in group.iterrows(): + kernel_dict = { + "tile_m": row["tile_m"], + "tile_n": row["tile_n"], + "tile_k": row["tile_k"], + "warp_m": row["warp_m"], + "warp_n": row["warp_n"], + "warp_k": row["warp_k"], + "warp_tile_m": row["warp_tile_m"], + "warp_tile_n": row["warp_tile_n"], + "warp_tile_k": row["warp_tile_k"], + "pipeline": row["pipeline"], + "scheduler": row["scheduler"], + "epilogue": row["epilogue"], + "pad_m": row["pad_m"], + "pad_n": row["pad_n"], + "pad_k": row["pad_k"], + "persistent": row["persistent"], + "kernel_name": row["kernel_name"], + } + kernel_configs.append(kernel_dict) + + # Use ML model to rank kernels + problem = { + "m": m, + "n": n, + "k": k, + "dtype": dtype, + "layout": layout, + "split_k": 1, + } + + try: + ranked = predictor.rank_kernels(problem, kernel_configs) + + if ranked: + ml_best_kernel, ml_predicted_tflops = ranked[0] + + # Find actual TFLOPS for the ML-predicted kernel + ml_kernel_row = group[group["kernel_name"] == ml_best_kernel] + if len(ml_kernel_row) > 0: + ml_actual_tflops = 
ml_kernel_row["measured_tflops"].values[0] + + # Calculate efficiency + efficiency_pct = 100.0 * (ml_actual_tflops / oracle_best_tflops) + + # Determine if ML picked oracle-best + is_oracle_best = ml_best_kernel == oracle_best_kernel + + results.append( + { + "m": m, + "n": n, + "k": k, + "oracle_best_tflops": oracle_best_tflops, + "oracle_best_kernel": oracle_best_kernel, + "ml_predicted_tflops": ml_predicted_tflops, + "ml_selected_kernel": ml_best_kernel, + "ml_actual_tflops": ml_actual_tflops, + "efficiency_pct": efficiency_pct, + "is_oracle_best": is_oracle_best, + "num_kernels": len(group), + } + ) + + if (shape_idx + 1) % 20 == 0: + status = "✓" if is_oracle_best else f"{efficiency_pct:.1f}%" + print( + f" [{shape_idx + 1:3d}/{len(shape_groups)}] " + f"M={m:4d} N={n:5d} K={k:5d}: {status}" + ) + except Exception as e: + print(f" Error on shape M={m} N={n} K={k}: {e}") + continue + + print() + print("=" * 100) + print(" Results Summary") + print("=" * 100) + print() + + if results: + df_results = pd.DataFrame(results) + efficiencies = df_results["efficiency_pct"].values + oracle_matches = df_results["is_oracle_best"].sum() + + print(f"Total shapes tested: {len(results)}") + print() + print("Efficiency Statistics (% of Oracle-Best TFLOPS):") + print(f" Mean: {np.mean(efficiencies):.2f}%") + print(f" Median: {np.median(efficiencies):.2f}%") + print(f" Min: {np.min(efficiencies):.2f}%") + print(f" Max: {np.max(efficiencies):.2f}%") + print(f" P10: {np.percentile(efficiencies, 10):.2f}%") + print(f" P50: {np.percentile(efficiencies, 50):.2f}%") + print(f" P90: {np.percentile(efficiencies, 90):.2f}%") + print() + print( + f"Oracle-best matches: {oracle_matches}/{len(results)} ({100 * oracle_matches / len(results):.1f}%)" + ) + print() + + # Classify by M size + df_results["m_class"] = pd.cut( + df_results["m"], + bins=[0, 8, 128, 1024, float("inf")], + labels=[ + "Tiny (M<8)", + "Small (8≤M<128)", + "Medium (128≤M<1024)", + "Large (M≥1024)", + ], + ) + + print("Efficiency by M size:") + for m_class in [ + "Tiny (M<8)", + "Small (8≤M<128)", + "Medium (128≤M<1024)", + "Large (M≥1024)", + ]: + subset = df_results[df_results["m_class"] == m_class] + if len(subset) > 0: + print( + f" {m_class:25s}: {subset['efficiency_pct'].mean():6.2f}% " + f"(n={len(subset)}, P10={subset['efficiency_pct'].quantile(0.1):.2f}%)" + ) + + print() + + # Save results + output_file = f"validation_results_{dtype}_{layout}.csv" + df_results.to_csv(output_file, index=False) + print(f"✓ Results saved to {output_file}") + + # Show best and worst shapes + print() + print("Top 5 shapes (best efficiency):") + top5 = df_results.nlargest(5, "efficiency_pct")[ + ["m", "n", "k", "efficiency_pct", "oracle_best_tflops", "is_oracle_best"] + ] + for idx, row in top5.iterrows(): + match = "✓" if row["is_oracle_best"] else " " + print( + f" {match} M={row['m']:5d} N={row['n']:5d} K={row['k']:5d}: " + f"{row['efficiency_pct']:.2f}% ({row['oracle_best_tflops']:.2f} TFLOPS)" + ) + + print() + print("Bottom 5 shapes (worst efficiency):") + bottom5 = df_results.nsmallest(5, "efficiency_pct")[ + ["m", "n", "k", "efficiency_pct", "oracle_best_tflops", "is_oracle_best"] + ] + for idx, row in bottom5.iterrows(): + match = "✓" if row["is_oracle_best"] else " " + print( + f" {match} M={row['m']:5d} N={row['n']:5d} K={row['k']:5d}: " + f"{row['efficiency_pct']:.2f}% ({row['oracle_best_tflops']:.2f} TFLOPS)" + ) + + else: + print("No results to display") + + print() + print("=" * 100) + + +def main(): + parser = argparse.ArgumentParser( + 
description="Validate ML heuristic predictions against oracle-best from training data" + ) + parser.add_argument( + "--dtype", + default="fp16", + choices=["fp16", "bf16", "fp8"], + help="Data type to validate", + ) + parser.add_argument( + "--layout", + default="rcr", + choices=["rcr", "rrr", "crr", "ccr"], + help="Matrix layout", + ) + parser.add_argument( + "--model_dir", + default=None, + help="Path to model directory (auto-detect if not specified)", + ) + parser.add_argument( + "--data_dir", + default=None, + help="Path to training data directory (auto-detect if not specified)", + ) + + args = parser.parse_args() + + # Auto-detect model directory if not specified + if args.model_dir is None: + heuristics_dir = Path(__file__).parent + model_candidates = [ + heuristics_dir / "models" / f"gemm_universal_{args.dtype}_gfx950", + heuristics_dir / "models" / f"gemm_universal_{args.dtype}_gfx942", + ] + for candidate in model_candidates: + if candidate.exists(): + args.model_dir = str(candidate) + break + + if args.model_dir is None: + print(f"❌ Error: Could not find model directory for {args.dtype}") + print(f" Searched: {[str(c) for c in model_candidates]}") + print(" Please specify --model_dir explicitly") + return 1 + + # Auto-detect data directory if not specified + if args.data_dir is None: + heuristics_dir = Path(__file__).parent + args.data_dir = str(heuristics_dir / "data") + + validate_ml_heuristic(args.dtype, args.layout, args.model_dir, args.data_dir) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dispatcher/include/ck_tile/dispatcher.hpp b/dispatcher/include/ck_tile/dispatcher.hpp index 98d8bb9333..b3d8f10675 100644 --- a/dispatcher/include/ck_tile/dispatcher.hpp +++ b/dispatcher/include/ck_tile/dispatcher.hpp @@ -3,9 +3,17 @@ #pragma once -/// Main dispatcher header - includes all core components -/// Use this for convenient access to the full dispatcher API +/// Full dispatcher header - includes ALL operation types. +/// For minimal includes, use the per-operation headers instead: +/// ck_tile/dispatcher_gemm.hpp -- GEMM only +/// ck_tile/dispatcher_conv.hpp -- Grouped Convolution only +// Core (needed by all ops) +#include "ck_tile/dispatcher/base_registry.hpp" +#include "ck_tile/dispatcher/dispatcher_error.hpp" +#include "ck_tile/dispatcher/example_args.hpp" + +// GEMM #include "ck_tile/dispatcher/kernel_key.hpp" #include "ck_tile/dispatcher/kernel_config.hpp" #include "ck_tile/dispatcher/kernel_decl.hpp" @@ -13,7 +21,15 @@ #include "ck_tile/dispatcher/kernel_instance.hpp" #include "ck_tile/dispatcher/registry.hpp" #include "ck_tile/dispatcher/dispatcher.hpp" +#include "ck_tile/dispatcher/json_export.hpp" #include "ck_tile/dispatcher/arch_filter.hpp" #include "ck_tile/dispatcher/backends/tile_backend.hpp" #include "ck_tile/dispatcher/backends/generated_tile_backend.hpp" #include "ck_tile/dispatcher/utils.hpp" + +// Grouped Convolution +#include "ck_tile/dispatcher/grouped_conv_config.hpp" +#include "ck_tile/dispatcher/grouped_conv_problem.hpp" +#include "ck_tile/dispatcher/grouped_conv_kernel_decl.hpp" +#include "ck_tile/dispatcher/grouped_conv_registry.hpp" +#include "ck_tile/dispatcher/grouped_conv_utils.hpp" diff --git a/dispatcher/include/ck_tile/dispatcher/README.md b/dispatcher/include/ck_tile/dispatcher/README.md index db3ce996a9..430798aedd 100644 --- a/dispatcher/include/ck_tile/dispatcher/README.md +++ b/dispatcher/include/ck_tile/dispatcher/README.md @@ -1,6 +1,6 @@ # CK Tile Dispatcher - C++ Headers -C++ API for the CK Tile dispatcher. 
+C++ API for the CK Tile dispatcher (GEMM and Grouped Convolution). > **See also:** [Main Dispatcher README](../../../../README.md) for installation and core concepts. @@ -8,16 +8,25 @@ C++ API for the CK Tile dispatcher. ``` dispatcher/ -├── dispatcher.hpp # Main dispatcher (kernel selection) -├── registry.hpp # Kernel registry (storage & lookup) -├── problem.hpp # Problem specification -├── kernel_key.hpp # Kernel configuration key -├── kernel_instance.hpp # Kernel instance interface -├── utils.hpp # Utilities (timers, GPU buffers) -│ -└── backends/ # Backend implementations - ├── generated_tile_backend.hpp # CK Tile kernels (production) - └── tile_backend.hpp # Tile backend base +|---- dispatcher.hpp # Main include (includes all below) +| +|---- # GEMM Headers +|---- registry.hpp # Kernel registry (storage & lookup) +|---- problem.hpp # GEMM problem specification +|---- kernel_key.hpp # Kernel configuration key +|---- kernel_instance.hpp # Kernel instance interface +|---- utils.hpp # Utilities (timers, GPU buffers) +| +|---- # Grouped Convolution Headers +|---- grouped_conv_config.hpp # GroupedConvDirection, GroupedConvConfig +|---- grouped_conv_problem.hpp # GroupedConvProblem + ProblemBuilder +|---- grouped_conv_kernel_decl.hpp # GroupedConvKernelDecl, DECL_GROUPED_CONV_KERNEL_SET +|---- grouped_conv_registry.hpp # Thread-safe registry with JSON export & filtering +|---- grouped_conv_utils.hpp # Config creators, validation, benchmark utilities +| ++---- backends/ # Backend implementations + |---- generated_tile_backend.hpp # CK Tile kernels (production) + +---- tile_backend.hpp # Tile backend base ``` ## Quick Start @@ -148,6 +157,69 @@ auto kernel = create_generated_tile_kernel< >(key, name); ``` +## Grouped Convolution API + +### GroupedConvProblem (`grouped_conv_problem.hpp`) + +Problem specification with builder pattern: + +```cpp +#include "ck_tile/dispatcher/grouped_conv_problem.hpp" + +using namespace ck_tile::dispatcher; + +auto problem = GroupedConvProblemBuilder() + .n(2).g(1).c(128).k(256) + .input_spatial({28, 28}) + .filter_spatial({3, 3}) + .strides({1, 1}) + .dilations({1, 1}) + .left_pads({1, 1}) + .right_pads({1, 1}) + .build(); + +bool ok = problem.is_valid(); +``` + +### GroupedConvRegistry (`grouped_conv_registry.hpp`) + +Thread-safe registry with JSON export and filtering: + +```cpp +#include "ck_tile/dispatcher/grouped_conv_registry.hpp" + +auto& registry = GroupedConvRegistry::instance(); + +// Thread-safe registration +registry.register_kernel(kernel); + +// JSON export +std::string json = registry.export_json(); +registry.export_json_to_file("kernels.json"); + +// Filtering +auto gfx942_kernels = registry.filter_by_arch("gfx942"); +auto matched = registry.filter([](const auto& k) { return k.is_fwd(); }); +``` + +### DECL_GROUPED_CONV_KERNEL_SET (`grouped_conv_kernel_decl.hpp`) + +Declarative kernel definition: + +```cpp +DECL_GROUPED_CONV_KERNEL_SET(my_conv_kernels, + .add( + GroupedConvSignature().dtype("fp16").layout("nhwgc"), + GroupedConvAlgorithm().tile(128, 128, 32).wave(2, 2, 1) + .warp(32, 32, 16).pipeline("compv4"), + "gfx942" + ) +); + +// Register all matching current arch +DECL_GROUPED_CONV_KERNEL_ALL(all_conv_kernels, "gfx942"); +``` + ## Best Practices 1. Use `Release` build for performance @@ -155,6 +227,8 @@ auto kernel = create_generated_tile_kernel< 3. Use `Priority::High` for hand-tuned kernels 4. Reuse dispatcher instances 5. Clear registry between test runs +6. Use `GroupedConvProblemBuilder` for validated problem construction +7. 
Leverage `export_json()` for kernel inventory and debugging --- diff --git a/dispatcher/include/ck_tile/dispatcher/backends/generated_conv_backend.hpp b/dispatcher/include/ck_tile/dispatcher/backends/generated_conv_backend.hpp new file mode 100644 index 0000000000..04ee1b2d11 --- /dev/null +++ b/dispatcher/include/ck_tile/dispatcher/backends/generated_conv_backend.hpp @@ -0,0 +1,152 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +// Generated Convolution Kernel Backend +// +// Wraps CK Tile grouped convolution launchers for use through the +// GroupedConvDispatcher. Each generated kernel launcher is wrapped in +// a ConvKernelRunFn that builds the correct host-args type (forward, +// bwd-data, or bwd-weight) and calls Launcher::launch(). + +#pragma once + +#include "ck_tile/dispatcher/grouped_conv_problem.hpp" +#include "ck_tile/dispatcher/grouped_conv_registry.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" +#include +#include + +namespace ck_tile { +namespace dispatcher { +namespace backends { + +// Buffer context is defined in grouped_conv_registry.hpp (g_conv_dispatch_buffers) +// so there's no circular dependency. + +// Helper: build ck_tile::conv::ConvParam from GroupedConvProblem +inline ck_tile::conv::ConvParam make_conv_param_2d(const GroupedConvProblem& p) +{ + return ck_tile::conv::ConvParam{ + 2, + static_cast(p.G), + static_cast(p.N), + static_cast(p.K), + static_cast(p.C), + {static_cast(p.filter_spatial[1]), + static_cast(p.filter_spatial[2])}, + {static_cast(p.input_spatial[1]), + static_cast(p.input_spatial[2])}, + {static_cast(p.stride[1]), static_cast(p.stride[2])}, + {static_cast(p.dilation[1]), + static_cast(p.dilation[2])}, + {static_cast(p.padding[1]), static_cast(p.padding[2])}, + {static_cast(p.padding[1]), static_cast(p.padding[2])}}; +} + +inline ck_tile::conv::ConvParam make_conv_param_3d(const GroupedConvProblem& p) +{ + return ck_tile::conv::ConvParam{3, + static_cast(p.G), + static_cast(p.N), + static_cast(p.K), + static_cast(p.C), + {static_cast(p.filter_spatial[0]), + static_cast(p.filter_spatial[1]), + static_cast(p.filter_spatial[2])}, + {static_cast(p.input_spatial[0]), + static_cast(p.input_spatial[1]), + static_cast(p.input_spatial[2])}, + {static_cast(p.stride[0]), + static_cast(p.stride[1]), + static_cast(p.stride[2])}, + {static_cast(p.dilation[0]), + static_cast(p.dilation[1]), + static_cast(p.dilation[2])}, + {static_cast(p.padding[0]), + static_cast(p.padding[1]), + static_cast(p.padding[2])}, + {static_cast(p.padding[0]), + static_cast(p.padding[1]), + static_cast(p.padding[2])}}; +} + +// Create a RunFn for a forward convolution launcher (2D or 3D) +template +inline GroupedConvKernelInstance::RunFn make_conv_fwd_run_fn() +{ + return [](const GroupedConvProblem& problem, void* stream) -> float { + auto& ctx = g_conv_dispatch_buffers; + auto param = (NDim == 2) ? make_conv_param_2d(problem) : make_conv_param_3d(problem); + ck_tile::GroupedConvFwdHostArgs<> args( + param, ctx.input_ptr, ctx.weight_ptr, {}, ctx.output_ptr, 1); + ck_tile::stream_config sc; + sc.stream_id_ = reinterpret_cast(stream); + sc.time_kernel_ = ctx.benchmarking; + sc.log_level_ = 0; + sc.cold_niters_ = ctx.benchmarking ? ctx.warmup : 0; + sc.nrepeat_ = ctx.benchmarking ? 
ctx.repeat : 1; + sc.is_gpu_timer_ = ctx.benchmarking; + return LauncherType::launch(args, sc); + }; +} + +// Create a RunFn for a backward-data convolution launcher. +// Dispatcher convention: run(dY, W, dX, problem) where dX is computed. +// BwdDataHostArgs(param, in_ptr=dX, wei_ptr=W, {}, out_ptr=dY, k_batch) +template +inline GroupedConvKernelInstance::RunFn make_conv_bwd_data_run_fn() +{ + return [](const GroupedConvProblem& problem, void* stream) -> float { + auto& ctx = g_conv_dispatch_buffers; + auto param = (NDim == 2) ? make_conv_param_2d(problem) : make_conv_param_3d(problem); + ck_tile::GroupedConvBwdDataHostArgs args( + param, + ctx.output_ptr, // in_ptr = dX (being computed) + ctx.weight_ptr, // wei_ptr = W + {}, + ctx.input_ptr, // out_ptr = dY (gradient from next layer) + 1); + ck_tile::stream_config sc; + sc.stream_id_ = reinterpret_cast(stream); + sc.time_kernel_ = ctx.benchmarking; + sc.log_level_ = 0; + sc.cold_niters_ = ctx.benchmarking ? ctx.warmup : 0; + sc.nrepeat_ = ctx.benchmarking ? ctx.repeat : 1; + sc.is_gpu_timer_ = ctx.benchmarking; + return LauncherType::launch(args, sc); + }; +} + +// Create a RunFn for a backward-weight convolution launcher. +// Dispatcher convention: run(X, dY, dW, problem) where dW is computed. +// BwdWeightHostArgs(param, in_ptr=X, wei_ptr=dW, {}, out_ptr=dY, k_batch) +template +inline GroupedConvKernelInstance::RunFn make_conv_bwd_weight_run_fn() +{ + return [](const GroupedConvProblem& problem, void* stream) -> float { + auto& ctx = g_conv_dispatch_buffers; + auto param = (NDim == 2) ? make_conv_param_2d(problem) : make_conv_param_3d(problem); + const int k_batch = (ctx.split_k > 1) ? ctx.split_k : 1; + ck_tile::GroupedConvBwdWeightHostArgs args(param, + ctx.input_ptr, // in_ptr = X + ctx.output_ptr, // wei_ptr = dW (being computed) + {}, + ctx.weight_ptr, // out_ptr = dY + k_batch); + ck_tile::stream_config sc; + sc.stream_id_ = reinterpret_cast(stream); + sc.time_kernel_ = ctx.benchmarking; + sc.log_level_ = 0; + sc.cold_niters_ = ctx.benchmarking ? ctx.warmup : 0; + sc.nrepeat_ = ctx.benchmarking ? ctx.repeat : 1; + sc.is_gpu_timer_ = ctx.benchmarking; + return LauncherType::launch(args, sc); + }; +} + +} // namespace backends +} // namespace dispatcher +} // namespace ck_tile diff --git a/dispatcher/include/ck_tile/dispatcher/base_registry.hpp b/dispatcher/include/ck_tile/dispatcher/base_registry.hpp new file mode 100644 index 0000000000..2bb940c320 --- /dev/null +++ b/dispatcher/include/ck_tile/dispatcher/base_registry.hpp @@ -0,0 +1,199 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace ck_tile { +namespace dispatcher { + +/// Shared priority enum used by all registry types +enum class Priority +{ + Low = 0, + Normal = 1, + High = 2 +}; + +/// BaseRegistry: Thread-safe, priority-aware kernel storage shared by GEMM and Conv registries. 
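+///
+/// Illustrative derivation (a sketch, not the exact signature of the real derived
+/// classes): a GEMM registry keyed by kernel-name strings could be declared as
+///   class Registry : public BaseRegistry<Registry, std::string, KernelInstance>
+///   { /* adds export_json_to_file(...), lookup helpers, etc. */ };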
+/// +/// Template Parameters: +/// Derived - CRTP derived class (e.g., Registry, ConvRegistry) +/// KeyType - primary key type (std::string for GEMM, ConvKernelKey for Conv) +/// InstanceType - kernel instance type (KernelInstance, ConvKernelInstance) +/// KeyHash - hash functor for KeyType (defaults to std::hash<KeyType>) +template <typename Derived, typename KeyType, typename InstanceType, typename KeyHash = std::hash<KeyType>> +class BaseRegistry +{ + public: + using InstancePtr = std::shared_ptr<InstanceType>; + + struct Entry + { + InstancePtr instance; + Priority priority; + }; + + BaseRegistry() = default; + virtual ~BaseRegistry() = default; + + BaseRegistry(BaseRegistry&& other) noexcept + { + std::lock_guard lock(other.mutex_); + entries_ = std::move(other.entries_); + name_ = std::move(other.name_); + } + + BaseRegistry& operator=(BaseRegistry&& other) noexcept + { + if(this != &other) + { + std::scoped_lock lock(mutex_, other.mutex_); + entries_ = std::move(other.entries_); + name_ = std::move(other.name_); + } + return *this; + } + + BaseRegistry(const BaseRegistry&) = delete; + BaseRegistry& operator=(const BaseRegistry&) = delete; + + /// Register a kernel. If the key already exists, the new entry replaces it + /// unless the existing entry has strictly higher priority. + /// Same-priority registration overwrites (last-writer-wins at equal priority). + bool + register_kernel(const KeyType& key, InstancePtr instance, Priority priority = Priority::Normal) + { + std::lock_guard lock(mutex_); + auto it = entries_.find(key); + if(it != entries_.end() && it->second.priority > priority) + { + return false; + } + entries_[key] = Entry{std::move(instance), priority}; + return true; + } + + [[nodiscard]] std::size_t size() const + { + std::lock_guard lock(mutex_); + return entries_.size(); + } + + [[nodiscard]] bool empty() const + { + std::lock_guard lock(mutex_); + return entries_.empty(); + } + + void clear() + { + std::lock_guard lock(mutex_); + entries_.clear(); + } + + [[nodiscard]] std::string get_name() const + { + std::lock_guard lock(mutex_); + return name_; // return by value to avoid dangling reference + } + + void set_name(const std::string& name) + { + std::lock_guard lock(mutex_); + name_ = name; + } + + [[nodiscard]] std::vector<InstancePtr> get_all_instances() const + { + std::lock_guard lock(mutex_); + std::vector<InstancePtr> result; + result.reserve(entries_.size()); + for(const auto& [key, entry] : entries_) + { + result.push_back(entry.instance); + } + return result; + } + + std::size_t merge_from(const BaseRegistry& other, Priority priority = Priority::Normal) + { + std::scoped_lock lock(mutex_, other.mutex_); + std::size_t merged = 0; + for(const auto& [key, entry] : other.entries_) + { + auto it = entries_.find(key); + if(it == entries_.end() || it->second.priority <= priority) + { + entries_[key] = Entry{entry.instance, priority}; + ++merged; + } + } + return merged; + } + + /// Enable automatic JSON export after every kernel registration. + /// Requires the derived class to implement export_json_to_file(path, stats). 
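+    /// Illustrative use (assuming a derived registry object named reg):
+    ///   reg.enable_auto_export("kernels.json"); // re-export after each later registration
+    ///   reg.disable_auto_export();              // stop automatic exports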
+ void enable_auto_export(const std::string& path, + bool include_statistics = true, + bool export_on_every_registration = true) + { + std::lock_guard lock(mutex_); + auto_export_path_ = path; + auto_export_stats_ = include_statistics; + auto_export_on_register_ = export_on_every_registration; + auto_export_enabled_.store(true, std::memory_order_release); + } + + void disable_auto_export() { auto_export_enabled_.store(false, std::memory_order_release); } + + [[nodiscard]] bool is_auto_export_enabled() const + { + return auto_export_enabled_.load(std::memory_order_acquire); + } + + /// Call after registration to trigger auto-export if enabled. + void perform_auto_export() + { + if(!auto_export_enabled_.load(std::memory_order_acquire)) + return; + std::lock_guard lock(mutex_); + if(auto_export_on_register_) + { + static_cast(this)->export_json_to_file(auto_export_path_, auto_export_stats_); + } + } + + protected: + [[nodiscard]] const std::unordered_map& entries() const + { + return entries_; + } + + [[nodiscard]] std::unordered_map& entries_mut() { return entries_; } + + std::mutex& mutex() const { return mutex_; } + + private: + mutable std::mutex mutex_; + std::unordered_map entries_; + std::string name_ = "default"; + + std::atomic auto_export_enabled_{false}; + bool auto_export_on_register_ = true; + bool auto_export_stats_ = true; + std::string auto_export_path_; +}; + +} // namespace dispatcher +} // namespace ck_tile diff --git a/dispatcher/include/ck_tile/dispatcher/dispatcher.hpp b/dispatcher/include/ck_tile/dispatcher/dispatcher.hpp index 6d3f548138..d266d693da 100644 --- a/dispatcher/include/ck_tile/dispatcher/dispatcher.hpp +++ b/dispatcher/include/ck_tile/dispatcher/dispatcher.hpp @@ -23,6 +23,7 @@ #pragma once +#include "ck_tile/dispatcher/dispatcher_error.hpp" #include "ck_tile/dispatcher/kernel_instance.hpp" #include "ck_tile/dispatcher/problem.hpp" #include "ck_tile/dispatcher/registry.hpp" @@ -52,7 +53,11 @@ class Dispatcher /// Constructor /// @param registry Registry instance to use (default: global singleton) - explicit Dispatcher(Registry* registry = nullptr); + /// @param gfx_arch Target GPU architecture (e.g. 
"gfx950") + explicit Dispatcher(Registry* registry = nullptr, const std::string& gfx_arch = ""); + + void set_arch(const std::string& arch) { gfx_arch_ = arch; } + [[nodiscard]] const std::string& arch() const { return gfx_arch_; } /// Register a heuristic function for kernel selection /// @param heuristic Function that maps problems to ranked kernel identifiers @@ -74,7 +79,7 @@ class Dispatcher /// @param problem Problem configuration /// @param stream HIP stream for kernel launch (nullptr = default stream) /// @return Kernel execution time in milliseconds - /// @throws std::runtime_error if no suitable kernel found + /// @throws NoKernelFound if no suitable kernel found [[nodiscard]] float run(const void* a_ptr, const void* b_ptr, void* c_ptr, @@ -89,7 +94,7 @@ class Dispatcher /// @param problem Problem configuration /// @param stream HIP stream for kernel launch (nullptr = default stream) /// @return Kernel execution time in milliseconds - /// @throws std::runtime_error if no suitable kernel found + /// @throws NoKernelFound if no suitable kernel found [[nodiscard]] float run_fused(const void* a_ptr, const void* b_ptr, void* c_ptr, @@ -106,7 +111,8 @@ class Dispatcher /// @param problem Problem configuration /// @param stream HIP stream for kernel launch (nullptr = default stream) /// @return Kernel execution time in milliseconds - /// @throws std::runtime_error if kernel not found or doesn't support problem + /// @throws NoKernelFound if the kernel identifier is not registered + /// @throws UnsupportedProblem if the selected kernel does not support the problem [[nodiscard]] float run_explicit(const std::string& kernel_id, const void* a_ptr, const void* b_ptr, @@ -130,10 +136,18 @@ class Dispatcher const Problem& problem, float tolerance = 1e-3f) const; + /// Enable or disable GPU benchmarking (timing) on all kernels. + /// When disabled, kernels execute once with no timing overhead + /// (one-shot mode for production plugins). + void set_benchmarking(bool enable) { benchmarking_ = enable; } + [[nodiscard]] bool benchmarking_enabled() const { return benchmarking_; } + private: Registry* registry_; HeuristicFunction heuristic_; SelectionStrategy strategy_; + std::string gfx_arch_; + bool benchmarking_ = true; /// Select kernel using first-fit strategy [[nodiscard]] KernelInstancePtr select_first_fit(const Problem& problem) const; diff --git a/dispatcher/include/ck_tile/dispatcher/dispatcher_error.hpp b/dispatcher/include/ck_tile/dispatcher/dispatcher_error.hpp new file mode 100644 index 0000000000..98b079f8d9 --- /dev/null +++ b/dispatcher/include/ck_tile/dispatcher/dispatcher_error.hpp @@ -0,0 +1,28 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +namespace ck_tile { +namespace dispatcher { + +struct DispatcherError : std::runtime_error +{ + using std::runtime_error::runtime_error; +}; + +struct NoKernelFound : DispatcherError +{ + using DispatcherError::DispatcherError; +}; + +struct UnsupportedProblem : DispatcherError +{ + using DispatcherError::DispatcherError; +}; + +} // namespace dispatcher +} // namespace ck_tile diff --git a/dispatcher/include/ck_tile/dispatcher/dispatcher_log.hpp b/dispatcher/include/ck_tile/dispatcher/dispatcher_log.hpp new file mode 100644 index 0000000000..6a39766649 --- /dev/null +++ b/dispatcher/include/ck_tile/dispatcher/dispatcher_log.hpp @@ -0,0 +1,55 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +namespace ck_tile { +namespace dispatcher { + +/// Log levels for dispatcher transparency: +/// 0 = silent (default) +/// 1 = print selected kernel name +/// 2 = print all candidates considered and acceptance/rejection reasons +inline int get_log_level() +{ + static int level = []() { + const char* env = std::getenv("CK_DISPATCHER_LOG_LEVEL"); + return env ? std::atoi(env) : 0; + }(); + return level; +} + +inline void log_kernel_selected(const std::string& kernel_name, const std::string& problem_desc) +{ + if(get_log_level() >= 1) + { + std::cerr << "[CK Dispatcher] Selected kernel: " << kernel_name << " for " << problem_desc + << std::endl; + } +} + +inline void +log_kernel_candidate(const std::string& kernel_name, bool accepted, const std::string& reason) +{ + if(get_log_level() >= 2) + { + std::cerr << "[CK Dispatcher] Candidate: " << kernel_name << " -> " + << (accepted ? "ACCEPTED" : "REJECTED") + << (reason.empty() ? "" : " (" + reason + ")") << std::endl; + } +} + +inline void log_no_kernel_found(const std::string& problem_desc) +{ + if(get_log_level() >= 1) + { + std::cerr << "[CK Dispatcher] No kernel found for " << problem_desc << std::endl; + } +} + +} // namespace dispatcher +} // namespace ck_tile diff --git a/dispatcher/include/ck_tile/dispatcher/grouped_conv_config.hpp b/dispatcher/include/ck_tile/dispatcher/grouped_conv_config.hpp new file mode 100644 index 0000000000..91b7b3ad74 --- /dev/null +++ b/dispatcher/include/ck_tile/dispatcher/grouped_conv_config.hpp @@ -0,0 +1,588 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/** + * @file grouped_conv_config.hpp + * @brief CK Tile Grouped Convolution Configuration with Builder-style naming + * + * This adopts the Signature/Algorithm/Arch pattern from: + * experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp + * + * Structure: + * - Signature: WHAT operation (types, layouts, direction, element ops) + * - Algorithm: HOW it's computed (tiles, warps, pipeline, scheduler, padding) + * - Arch: Target GPU architecture + */ + +#pragma once + +// Use common kernel_key types for DataType, Pipeline, etc. 
+#include "ck_tile/dispatcher/kernel_key.hpp" + +#include +#include +#include +#include + +namespace ck_tile { +namespace dispatcher { + +// DataType, Pipeline, Scheduler, Epilogue are defined in kernel_key.hpp +// No need to redefine them here + +// ============================================================================= +// Data Type Enum (matching CK Tile numeric types) +// ============================================================================= + +enum class ConvDataType +{ + // Standard floating point + FP32, // float + FP64, // double + FP16, // half_t + BF16, // bf16_t + + // 8-bit float variants (FP8/BF8) + FP8, // fp8_t (E4M3) + BF8, // bf8_t (E5M2) + FP8_E4M3, // Explicit E4M3 format + FP8_E5M2, // Explicit E5M2 format + + // Integer types + INT8, // int8_t + UINT8, // uint8_t + INT32, // int32_t (accumulator) + + // 4-bit types (gfx950+ only) + FP4, // MXFP4 + INT4 // pk_int4_t +}; + +// ============================================================================= +// Direction and Layout Enums +// ============================================================================= + +enum class GroupedConvDirection +{ + FORWARD, + BACKWARD_DATA, + BACKWARD_WEIGHT +}; + +enum class ConvLayout2D +{ + GNHWC_GKYXC_GNHWK, // NHWC-style + NHWGC_GKYXC_NHWGK, + NGCHW_GKYXC_NGKHW, // NCHW-style + NGCHW_GKCYX_NGKHW +}; + +enum class ConvLayout3D +{ + GNDHWC_GKZYXC_GNDHWK, + NDHWGC_GKZYXC_NDHWGK, + NGCDHW_GKZYXC_NGKDHW, + NGCDHW_GKCZYX_NGKDHW +}; + +// ============================================================================= +// Element-wise Operations +// ============================================================================= + +enum class ElementwiseOp +{ + PASS_THROUGH, + BIAS, + BIAS_CLAMP, + SCALE, + BILINEAR, + RELU, + GELU, + SIGMOID, + TANH +}; + +// ============================================================================= +// Grouped Convolution Specialization +// ============================================================================= + +enum class ConvSpecialization +{ + DEFAULT, + FILTER_1X1_PAD0, + FILTER_1X1_STRIDE1_PAD0, + FILTER_3X3, + FILTER_5X5, + FILTER_7X7 +}; + +// ============================================================================= +// Memory Operation Types (for accumulator operations) +// ============================================================================= + +enum class MemoryOperation +{ + SET, // Direct write (=) + ATOMIC_ADD, // Atomic addition (+=) + ATOMIC_MAX, // Atomic max + ADD // Non-atomic addition +}; + +// ============================================================================= +// Epilogue Types +// ============================================================================= + +enum class EpilogueType +{ + CSHUFFLE, // C-shuffle epilogue + DEFAULT_2D, // Default 2D epilogue + DEFAULT_GEMM_2D, // Default GEMM 2D epilogue + DIRECT_STORE, // Direct store without shuffle + BIAS_ADD, // Add bias + BIAS_ADD_RELU, // Add bias + ReLU + BIAS_ADD_GELU // Add bias + GELU +}; + +// ============================================================================= +// Algorithm Enums (matching builder/types.hpp and CK Tile pipelines) +// ============================================================================= + +enum class PipelineVersion +{ + V1, // Basic pipeline V1 + V2, // Basic pipeline V2 + V3, // Compute V3 (intrawave only) + V4, // Compute V4 (double buffer, ping-pong LDS) + V5, // Compute V5 (wave groups) + V6, // Compute V6 (newest) + MEMORY, // Memory pipeline + COMPUTE_ASYNC, // Compute with async copy + 
PRESHUFFLE_V2 // Preshuffle V2 pipeline +}; + +enum class PipelineScheduler +{ + DEFAULT, + INTRAWAVE, + INTERWAVE +}; + +enum class GemmPadding +{ + DEFAULT, + NO_PADDING, // No padding + M_PADDING, + N_PADDING, + K_PADDING, + MN_PADDING, + MK_PADDING, + NK_PADDING, + MNK_PADDING +}; + +// ============================================================================= +// Signature Info (WHAT operation) +// ============================================================================= + +struct GroupedConvSignatureInfo +{ + int spatial_dim = 2; // 1, 2, or 3 + GroupedConvDirection direction = GroupedConvDirection::FORWARD; + std::string in_type = "fp16"; + std::string wei_type = "fp16"; + std::string out_type = "fp16"; + std::string acc_type = "fp32"; + std::string workspace_type = "fp32"; // For two-stage algorithms + std::string bias_type = "fp16"; // For bias epilogue + ElementwiseOp in_element_op = ElementwiseOp::PASS_THROUGH; + ElementwiseOp wei_element_op = ElementwiseOp::PASS_THROUGH; + ElementwiseOp out_element_op = ElementwiseOp::PASS_THROUGH; + ConvSpecialization conv_spec = ConvSpecialization::DEFAULT; + int num_groups = 1; + + // String helpers + static const char* direction_str(GroupedConvDirection dir) + { + switch(dir) + { + case GroupedConvDirection::FORWARD: return "fwd"; + case GroupedConvDirection::BACKWARD_DATA: return "bwd_data"; + case GroupedConvDirection::BACKWARD_WEIGHT: return "bwd_weight"; + default: return "unknown"; + } + } + + static const char* datatype_str(ConvDataType dt) + { + switch(dt) + { + case ConvDataType::FP32: return "fp32"; + case ConvDataType::FP64: return "fp64"; + case ConvDataType::FP16: return "fp16"; + case ConvDataType::BF16: return "bf16"; + case ConvDataType::FP8: return "fp8"; + case ConvDataType::BF8: return "bf8"; + case ConvDataType::FP8_E4M3: return "fp8_e4m3"; + case ConvDataType::FP8_E5M2: return "fp8_e5m2"; + case ConvDataType::INT8: return "int8"; + case ConvDataType::UINT8: return "uint8"; + case ConvDataType::INT32: return "int32"; + case ConvDataType::FP4: return "fp4"; + case ConvDataType::INT4: return "int4"; + default: return "unknown"; + } + } +}; + +// ============================================================================= +// Algorithm Info (HOW it's computed) +// ============================================================================= + +struct DataTileInfo +{ + int m = 128; // M tile (output spatial * N) + int n = 128; // N tile (K output channels) + int k = 64; // K tile (C input channels) +}; + +struct WarpGemmParams +{ + int gemm_m = 16; // MFMA M dimension (MPerXDL) + int gemm_n = 16; // MFMA N dimension (NPerXDL) + int m_iter = 2; // M iterations per warp (MXdlPerWave) + int n_iter = 2; // N iterations per warp (NXdlPerWave) +}; + +struct BlockWarpConfig +{ + int m_warp = 2; // Warps along M + int n_warp = 2; // Warps along N + int k_warp = 1; // Warps along K + int m_warp_tile = 32; // Warp tile M + int n_warp_tile = 32; // Warp tile N + int k_warp_tile = 16; // Warp tile K +}; + +struct VectorSizeInfo +{ + int a = 4; // Input vector size + int b = 8; // Weight vector size + int c = 8; // Output vector size +}; + +struct GroupedConvAlgorithmInfo +{ + DataTileInfo tile; + BlockWarpConfig warp; + VectorSizeInfo vector_size; + + PipelineVersion pipeline = PipelineVersion::V4; + PipelineScheduler scheduler = PipelineScheduler::INTRAWAVE; + GemmPadding padding = GemmPadding::MNK_PADDING; + MemoryOperation memory_op = MemoryOperation::SET; + EpilogueType epilogue = EpilogueType::CSHUFFLE; + + int 
thread_block_size = 256; + bool double_smem_buffer = false; + int num_wave_groups = 1; + int block_per_cu = 1; + int num_groups_to_merge = 1; + + // Pipeline string + static const char* pipeline_str(PipelineVersion pv) + { + switch(pv) + { + case PipelineVersion::V1: return "v1"; + case PipelineVersion::V2: return "v2"; + case PipelineVersion::V3: return "compv3"; + case PipelineVersion::V4: return "compv4"; + case PipelineVersion::V5: return "compv5"; + case PipelineVersion::V6: return "compv6"; + case PipelineVersion::MEMORY: return "mem"; + case PipelineVersion::COMPUTE_ASYNC: return "comp_async"; + case PipelineVersion::PRESHUFFLE_V2: return "preshuffle_v2"; + default: return "unknown"; + } + } + + static const char* scheduler_str(PipelineScheduler ps) + { + switch(ps) + { + case PipelineScheduler::DEFAULT: return "default"; + case PipelineScheduler::INTRAWAVE: return "intrawave"; + case PipelineScheduler::INTERWAVE: return "interwave"; + default: return "unknown"; + } + } + + static const char* memory_op_str(MemoryOperation mo) + { + switch(mo) + { + case MemoryOperation::SET: return "set"; + case MemoryOperation::ATOMIC_ADD: return "atomic_add"; + case MemoryOperation::ATOMIC_MAX: return "atomic_max"; + case MemoryOperation::ADD: return "add"; + default: return "unknown"; + } + } + + static const char* epilogue_str(EpilogueType et) + { + switch(et) + { + case EpilogueType::CSHUFFLE: return "cshuffle"; + case EpilogueType::DEFAULT_2D: return "default_2d"; + case EpilogueType::DEFAULT_GEMM_2D: return "default_gemm_2d"; + case EpilogueType::DIRECT_STORE: return "direct_store"; + case EpilogueType::BIAS_ADD: return "bias_add"; + case EpilogueType::BIAS_ADD_RELU: return "bias_add_relu"; + case EpilogueType::BIAS_ADD_GELU: return "bias_add_gelu"; + default: return "unknown"; + } + } +}; + +// ============================================================================= +// Arch Info (Target GPU) +// ============================================================================= + +struct ArchInfo +{ + std::string name = "gfx942"; // MI300X default + int max_waves_per_cu = 8; + int lds_size_kb = 64; + int sgpr_count = 108; + int vgpr_count = 512; + + bool supports_mfma_fp16() const { return name.find("gfx9") != std::string::npos; } + bool supports_wmma() const { return name.find("gfx11") != std::string::npos; } +}; + +// ============================================================================= +// Full Grouped Conv Config (combines Signature + Algorithm + Arch) +// ============================================================================= + +struct GroupedConvConfig +{ + GroupedConvSignatureInfo signature; + GroupedConvAlgorithmInfo algorithm; + ArchInfo arch; + + // Generate unique kernel name + std::string name() const + { + std::ostringstream oss; + oss << "grouped_conv_" << GroupedConvSignatureInfo::direction_str(signature.direction) + << "_" << signature.in_type << "_" << signature.spatial_dim << "d" << "_" + << GroupedConvAlgorithmInfo::pipeline_str(algorithm.pipeline) << "_" << algorithm.tile.m + << "x" << algorithm.tile.n << "x" << algorithm.tile.k; + return oss.str(); + } + + // Brief description + std::string brief() const + { + std::ostringstream oss; + oss << signature.spatial_dim << "D " + << GroupedConvSignatureInfo::direction_str(signature.direction) + << " Grouped Convolution (" << signature.in_type << ")"; + return oss.str(); + } + + // Detailed description (tree-like) + std::string detailed() const + { + std::ostringstream oss; + oss << signature.spatial_dim << "D " + 
<< GroupedConvSignatureInfo::direction_str(signature.direction) + << " Grouped Convolution Kernel\n"; + + oss << " Signature:\n"; + oss << " Data Type: " << signature.in_type << "\n"; + oss << " Accumulator: " << signature.acc_type << "\n"; + oss << " Groups: " << signature.num_groups << "\n"; + + oss << " Algorithm:\n"; + oss << " Thread Block Size: " << algorithm.thread_block_size << "\n"; + oss << " Data Tile: " << algorithm.tile.m << "x" << algorithm.tile.n << "x" + << algorithm.tile.k << "\n"; + oss << " Warp Config: " << algorithm.warp.m_warp << "x" << algorithm.warp.n_warp << "x" + << algorithm.warp.k_warp << "\n"; + oss << " Warp Tile: " << algorithm.warp.m_warp_tile << "x" << algorithm.warp.n_warp_tile + << "x" << algorithm.warp.k_warp_tile << "\n"; + oss << " Pipeline: " << GroupedConvAlgorithmInfo::pipeline_str(algorithm.pipeline) + << "\n"; + oss << " Scheduler: " << GroupedConvAlgorithmInfo::scheduler_str(algorithm.scheduler) + << "\n"; + + oss << " Arch:\n"; + oss << " Target: " << arch.name << "\n"; + + return oss.str(); + } +}; + +// ============================================================================= +// Predefined Configs +// ============================================================================= + +namespace configs { + +// Memory-bound config +template +struct Memory : public GroupedConvConfig +{ + Memory() + { + algorithm.tile = {128, 32, 128 / (int)sizeof(PrecType)}; + algorithm.warp = {4, 1, 1, 32, 32, 16}; + algorithm.pipeline = PipelineVersion::MEMORY; + algorithm.double_smem_buffer = false; + } +}; + +// Compute V3 - Small +template +struct CompV3_Small : public GroupedConvConfig +{ + CompV3_Small() + { + algorithm.tile = {16, 64, 64}; + algorithm.warp = {1, 4, 1, 16, 16, 32}; + algorithm.pipeline = PipelineVersion::V3; + } +}; + +// Compute V3 - Medium +template +struct CompV3_Medium : public GroupedConvConfig +{ + CompV3_Medium() + { + algorithm.tile = {128, 128, 128 / (int)sizeof(PrecType)}; + algorithm.warp = {2, 2, 1, 16, 16, 32}; + algorithm.pipeline = PipelineVersion::V3; + algorithm.block_per_cu = 2; + } +}; + +// Compute V3 - Large +template +struct CompV3_Large : public GroupedConvConfig +{ + CompV3_Large() + { + algorithm.tile = {256, 256, 128 / (int)sizeof(PrecType)}; + algorithm.warp = {2, 2, 1, 32, 32, 16}; + algorithm.pipeline = PipelineVersion::V3; + } +}; + +// Compute V4 - Double buffered +template +struct CompV4 : public GroupedConvConfig +{ + CompV4() + { + algorithm.tile = {256, 256, 64 / (int)sizeof(PrecType)}; + algorithm.warp = {2, 2, 1, 32, 32, 16}; + algorithm.pipeline = PipelineVersion::V4; + algorithm.double_smem_buffer = true; + } +}; + +// Compute V5 - Wave groups +template +struct CompV5 : public GroupedConvConfig +{ + CompV5() + { + algorithm.tile = {128, 128, 64 / (int)sizeof(PrecType)}; + algorithm.warp = {1, 1, 2, 32, 32, 16}; + algorithm.pipeline = PipelineVersion::V5; + algorithm.num_wave_groups = 2; + } +}; + +// WMMA config for gfx11xx +template +struct WMMA : public GroupedConvConfig +{ + WMMA() + { + algorithm.tile = {128, 128, 64 / (int)sizeof(PrecType)}; + algorithm.warp = {4, 2, 1, 16, 16, 16}; + algorithm.pipeline = PipelineVersion::V3; + algorithm.block_per_cu = 2; + arch.name = "gfx1100"; + } +}; + +// Merged groups config +template +struct CompV3_MergedGroups : public GroupedConvConfig +{ + CompV3_MergedGroups() + { + algorithm.tile = {16, 32, 32}; + algorithm.warp = {1, 2, 1, 16, 16, 32}; + algorithm.vector_size = {4, 8, 8}; + algorithm.pipeline = PipelineVersion::V3; + algorithm.num_groups_to_merge = 
2; + } +}; + +} // namespace configs + +// ============================================================================= +// DataType Traits (compile-time type info for CK Tile types) +// ============================================================================= + +template +struct DataTypeTraits; + +template <> +struct DataTypeTraits +{ + static constexpr const char* name = "fp32"; + static constexpr int size_bytes = 4; +}; + +template <> +struct DataTypeTraits +{ + static constexpr const char* name = "fp64"; + static constexpr int size_bytes = 8; +}; + +// Forward declare CK Tile types for traits +// Note: actual ck_tile types are defined in ck_tile/core/numeric/ +// These traits allow working with type names at compile time + +// ============================================================================= +// ConvTypeConfig (input/weight/acc/output type combinations) +// ============================================================================= + +template +struct ConvTypeConfig +{ + using input_type = InDataType; + using weight_type = WeiDataType; + using output_type = OutDataType; + using accumulator_type = AccDataType; +}; + +// Common type configurations as type aliases +// FP16 -> FP32 accumulator -> FP16 output (most common) +// BF16 -> FP32 accumulator -> BF16 output +// FP8 -> FP32 accumulator -> FP8 output +// INT8 -> INT32 accumulator -> INT8 output + +} // namespace dispatcher +} // namespace ck_tile diff --git a/dispatcher/include/ck_tile/dispatcher/grouped_conv_kernel_decl.hpp b/dispatcher/include/ck_tile/dispatcher/grouped_conv_kernel_decl.hpp new file mode 100644 index 0000000000..8ddfe445ff --- /dev/null +++ b/dispatcher/include/ck_tile/dispatcher/grouped_conv_kernel_decl.hpp @@ -0,0 +1,537 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +/** + * @file grouped_conv_kernel_decl.hpp + * @brief Declarative grouped convolution kernel specification + * + * USAGE: + * ====== + * + * // Named kernel sets for grouped convolution + * DECL_GROUPED_CONV_KERNEL_SET(gconv_fwd, + * .add("fp16", "nhwc", "forward", 128, 128, 32) + * .add("fp16", "nhwc", "forward", 256, 256, 64) + * ); + * + * // Access at runtime + * auto& set = GroupedConvKernelSetRegistry::instance().get("gconv_fwd"); + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace ck_tile { +namespace dispatcher { +namespace grouped_conv_decl { + +// ============================================================================= +// Wildcard constants +// ============================================================================= + +constexpr const char* ANY = "*"; +constexpr int ANY_INT = -1; + +// ============================================================================= +// GroupedConvSignature - WHAT operation +// ============================================================================= + +class GroupedConvSignature +{ + public: + std::string dtype_in_ = "fp16"; // Input data type + std::string dtype_wei_ = "fp16"; // Weight data type + std::string dtype_out_ = "fp16"; // Output data type + std::string dtype_acc_ = "fp32"; // Accumulator type + std::string dtype_workspace_ = "fp32"; // Workspace type (two-stage algorithms) + std::string dtype_bias_ = "fp16"; // Bias type (bias epilogue) + std::string layout_ = "nhwc"; // Data layout: nhwc, nchw + std::string conv_op_ = "forward"; // forward, bwd_data, bwd_weight + int num_dims_ = 2; // Spatial dimensions: 1, 2, or 3 + int groups_ = 1; // Group grouped convolution + std::string specialization_ = "default"; // Filter specialization + + GroupedConvSignature& dtype(const std::string& in, + const std::string& wei, + const std::string& out, + const std::string& acc = "fp32") + { + dtype_in_ = in; + dtype_wei_ = wei; + dtype_out_ = out; + dtype_acc_ = acc; + return *this; + } + + GroupedConvSignature& dtype(const std::string& all) + { + dtype_in_ = dtype_wei_ = dtype_out_ = dtype_bias_ = all; + dtype_acc_ = dtype_workspace_ = "fp32"; + return *this; + } + + GroupedConvSignature& dtype_workspace(const std::string& ws) + { + dtype_workspace_ = ws; + return *this; + } + + GroupedConvSignature& dtype_bias(const std::string& b) + { + dtype_bias_ = b; + return *this; + } + + GroupedConvSignature& layout(const std::string& l) + { + layout_ = l; + return *this; + } + GroupedConvSignature& conv_type(const std::string& op) + { + conv_op_ = op; + return *this; + } + GroupedConvSignature& dims(int d) + { + num_dims_ = d; + return *this; + } + GroupedConvSignature& groups(int g) + { + groups_ = g; + return *this; + } + GroupedConvSignature& spec(const std::string& s) + { + specialization_ = s; + return *this; + } + + std::string op_str() const + { + if(conv_op_ == "forward") + return "fwd"; + if(conv_op_ == "bwd_data") + return "bwd_data"; + if(conv_op_ == "bwd_weight") + return "bwd_weight"; + return conv_op_; + } +}; + +// ============================================================================= +// GroupedConvAlgorithm - HOW it's implemented +// ============================================================================= + +class GroupedConvAlgorithm +{ + public: + // Tile shape (M, N, K per tile - M=spatial*N, N=K_out, K=C_in) + int tile_m_ = 1; // Tile M (output spatial * batch) + int tile_n_ = 128; // Tile N (output channels K) + int tile_k_ = 128; // 
Tile K (input channels C) + + // Output spatial tile + int tile_ho_ = 1; + int tile_wo_ = 16; + + // Wave/warp shape + int wave_m_ = ANY_INT; + int wave_n_ = ANY_INT; + int wave_k_ = 1; + int warp_m_ = ANY_INT; + int warp_n_ = ANY_INT; + int warp_k_ = 16; + + // Vector sizes + int vector_a_ = 4; // Input vector size + int vector_b_ = 8; // Weight vector size + int vector_c_ = 8; // Output vector size + + // Pipeline configuration + std::string pipeline_ = "compv4"; + std::string scheduler_ = "intrawave"; + std::string epilogue_ = "cshuffle"; + std::string memory_op_ = "set"; // Memory operation: set, atomic_add, atomic_max, add + + // Occupancy/performance hints + int block_size_ = 256; + int block_per_cu_ = 1; + int num_wave_groups_ = 1; + int num_groups_to_merge_ = 1; + bool double_smem_buffer_ = false; + + // Padding -- always enabled for convolution (MNK padding assumed) + static constexpr bool pad_m_ = true; + static constexpr bool pad_n_ = true; + static constexpr bool pad_k_ = true; + + // Tile setter (M, N, K) + GroupedConvAlgorithm& tile(int m, int n, int k) + { + tile_m_ = m; + tile_n_ = n; + tile_k_ = k; + return *this; + } + + GroupedConvAlgorithm& tile_output(int ho, int wo) + { + tile_ho_ = ho; + tile_wo_ = wo; + return *this; + } + + GroupedConvAlgorithm& wave(int m, int n, int k = 1) + { + wave_m_ = m; + wave_n_ = n; + wave_k_ = k; + return *this; + } + + GroupedConvAlgorithm& warp(int m, int n, int k = 16) + { + warp_m_ = m; + warp_n_ = n; + warp_k_ = k; + return *this; + } + + GroupedConvAlgorithm& vector_sizes(int a, int b, int c) + { + vector_a_ = a; + vector_b_ = b; + vector_c_ = c; + return *this; + } + + GroupedConvAlgorithm& pipeline(const std::string& p) + { + pipeline_ = p; + return *this; + } + GroupedConvAlgorithm& scheduler(const std::string& s) + { + scheduler_ = s; + return *this; + } + GroupedConvAlgorithm& epilogue(const std::string& e) + { + epilogue_ = e; + return *this; + } + GroupedConvAlgorithm& memory_op(const std::string& m) + { + memory_op_ = m; + return *this; + } + + // Occupancy setters + GroupedConvAlgorithm& block_per_cu(int b) + { + block_per_cu_ = b; + return *this; + } + GroupedConvAlgorithm& num_wave_groups(int n) + { + num_wave_groups_ = n; + return *this; + } + GroupedConvAlgorithm& num_groups_to_merge(int n) + { + num_groups_to_merge_ = n; + return *this; + } + GroupedConvAlgorithm& double_smem_buffer(bool d) + { + double_smem_buffer_ = d; + return *this; + } + + bool needs_expansion() const + { + return wave_m_ == ANY_INT || warp_m_ == ANY_INT || pipeline_ == "*" || scheduler_ == "*"; + } + + /// Check if specific parameter needs expansion + bool needs_wave_expansion() const { return wave_m_ == ANY_INT || wave_n_ == ANY_INT; } + bool needs_warp_expansion() const { return warp_m_ == ANY_INT || warp_n_ == ANY_INT; } + bool needs_pipeline_expansion() const { return pipeline_ == "*"; } + bool needs_scheduler_expansion() const { return scheduler_ == "*"; } + + /// Auto-fill with defaults (for single kernel generation) + void auto_fill() + { + if(wave_m_ == ANY_INT) + wave_m_ = 2; + if(wave_n_ == ANY_INT) + wave_n_ = 2; + if(warp_m_ == ANY_INT) + warp_m_ = 32; + if(warp_n_ == ANY_INT) + warp_n_ = 32; + if(pipeline_ == "*") + pipeline_ = "compv4"; + if(scheduler_ == "*") + scheduler_ = "intrawave"; + } + + /// Get all valid wave configurations for arch + static std::vector> valid_wave_configs(const std::string& arch) + { + // Match arch_specs_generated.py WARP_SUPPORTED_COMBINATIONS + if(arch == "gfx942" || arch == "gfx90a" || arch == 
"gfx950") + { + return {{1, 4, 1}, {2, 2, 1}, {4, 1, 1}}; + } + return {{2, 2, 1}}; // Default + } + + /// Get all valid warp tile configurations + static std::vector> valid_warp_configs(const std::string& arch, + const std::string& dtype) + { + // Match arch_specs_generated.py WARP_TILE_SUPPORTED_COMBINATIONS + if(arch == "gfx942" && (dtype == "fp16" || dtype == "bf16")) + { + return {{16, 16, 16}, {32, 32, 16}}; + } + return {{32, 32, 16}}; // Default + } + + /// Get all valid pipeline/scheduler combinations for forward conv. + /// Backward operations (bwd_data/bwd_weight) only support compv3 and mem + /// due to transpose_tile2d and get_length constraints in CK Tile. + static std::vector> valid_trait_configs() + { + return { + {"compv3", "intrawave"}, + {"compv4", "intrawave"}, + {"compv5", "intrawave"}, + {"mem", "intrawave"}, + {"mem", "interwave"}, + }; + } +}; + +// ============================================================================= +// GroupedConvKernelDecl +// ============================================================================= + +struct GroupedConvKernelDecl +{ + GroupedConvSignature signature; + GroupedConvAlgorithm algorithm; + std::string arch = "gfx942"; + + GroupedConvKernelDecl() = default; + + GroupedConvKernelDecl(const GroupedConvSignature& sig, + const GroupedConvAlgorithm& algo, + const std::string& a = "gfx942") + : signature(sig), algorithm(algo), arch(a) + { + } + + std::string name() const + { + std::ostringstream oss; + // Generate full kernel name similar to GEMM: + // grouped_conv____d______ + oss << "grouped_conv_" << signature.op_str() << "_" << signature.dtype_in_ << "_" + << signature.layout_ << "_" << signature.num_dims_ << "d" << "_" << algorithm.pipeline_ + << "_" << algorithm.epilogue_ << "_" << algorithm.scheduler_ << "_" << algorithm.tile_m_ + << "x" << algorithm.tile_n_ << "x" << algorithm.tile_k_ << "_" << algorithm.wave_m_ + << "x" << algorithm.wave_n_ << "x" << algorithm.wave_k_ << "_" << algorithm.warp_m_ + << "x" << algorithm.warp_n_ << "x" << algorithm.warp_k_; + return oss.str(); + } + + bool has_wildcards() const { return algorithm.needs_expansion() || arch == "*"; } +}; + +// ============================================================================= +// GroupedConvKernelSet +// ============================================================================= + +class GroupedConvKernelSet +{ + public: + GroupedConvKernelSet() = default; + + GroupedConvKernelSet& add(const GroupedConvSignature& sig, + const GroupedConvAlgorithm& algo, + const std::string& arch = "gfx942") + { + decls_.emplace_back(sig, algo, arch); + return *this; + } + + // Simple add: dtype, layout, conv_type, tile_k, tile_c + GroupedConvKernelSet& add(const std::string& dtype, + const std::string& layout, + const std::string& conv_type, + int tile_k, + int tile_c, + const std::string& arch = "gfx942") + { + GroupedConvSignature sig; + sig.dtype(dtype).layout(layout).conv_type(conv_type); + GroupedConvAlgorithm algo; + algo.tile(1, tile_k, tile_c); + decls_.emplace_back(sig, algo, arch); + return *this; + } + + GroupedConvKernelSet& merge(const GroupedConvKernelSet& other) + { + decls_.insert(decls_.end(), other.decls_.begin(), other.decls_.end()); + return *this; + } + + const std::vector& declarations() const { return decls_; } + size_t size() const { return decls_.size(); } + + void print(std::ostream& os = std::cout) const + { + os << "GroupedConvKernelSet (" << size() << " declarations):\n"; + for(const auto& d : decls_) + { + os << " - " << d.name(); + 
if(d.algorithm.needs_expansion()) + os << " [expands]"; + os << "\n"; + } + } + + GroupedConvKernelSet& tag(const std::string& t) + { + tag_ = t; + return *this; + } + std::string tag() const { return tag_; } + + private: + std::vector decls_; + std::string tag_; +}; + +// ============================================================================= +// GroupedConvKernelSetRegistry +// ============================================================================= + +class GroupedConvKernelSetRegistry +{ + public: + static GroupedConvKernelSetRegistry& instance() + { + static GroupedConvKernelSetRegistry reg; + return reg; + } + + void add(const std::string& name, const GroupedConvKernelSet& set) + { + sets_[name] = set; + if(std::find(order_.begin(), order_.end(), name) == order_.end()) + { + order_.push_back(name); + } + } + + // Alias for add() for consistency with GEMM API + void register_set(const std::string& name, const GroupedConvKernelSet& set) { add(name, set); } + + const GroupedConvKernelSet& get(const std::string& name) const + { + static GroupedConvKernelSet empty; + auto it = sets_.find(name); + return it != sets_.end() ? it->second : empty; + } + + bool has(const std::string& name) const { return sets_.find(name) != sets_.end(); } + + std::vector names() const { return order_; } + size_t size() const { return sets_.size(); } + + void clear() + { + sets_.clear(); + order_.clear(); + } + + void print() const + { + std::cout << "Grouped Conv Kernel Sets (" << size() << "):\n"; + for(const auto& name : order_) + { + const auto& set = sets_.at(name); + std::cout << " " << name << ": " << set.size() << " declarations\n"; + } + } + + private: + GroupedConvKernelSetRegistry() = default; + std::unordered_map sets_; + std::vector order_; +}; + +// ============================================================================= +// Static Registrar +// ============================================================================= + +struct GroupedConvKernelSetRegistrar +{ + GroupedConvKernelSetRegistrar(const std::string& name, const GroupedConvKernelSet& set) + { + GroupedConvKernelSetRegistry::instance().add(name, set); + } +}; + +} // namespace grouped_conv_decl + +// Convenience aliases +using GroupedConvSignature = grouped_conv_decl::GroupedConvSignature; +using GroupedConvAlgorithm = grouped_conv_decl::GroupedConvAlgorithm; +using GroupedConvKernelDecl = grouped_conv_decl::GroupedConvKernelDecl; +using GroupedConvKernelSet = grouped_conv_decl::GroupedConvKernelSet; +using GroupedConvKernelSetRegistry = grouped_conv_decl::GroupedConvKernelSetRegistry; + +} // namespace dispatcher +} // namespace ck_tile + +// ============================================================================= +// Declaration Macros +// ============================================================================= + +#define CK_GROUPED_CONV_DECL_CAT_(a, b) CK_GROUPED_CONV_DECL_CAT_IMPL_(a, b) +#define CK_GROUPED_CONV_DECL_CAT_IMPL_(a, b) a##b + +// Note: __extension__ suppresses warnings about __COUNTER__ being a GCC/Clang extension +#define DECL_GROUPED_CONV_KERNEL_SET(name, ...) 
\ + __extension__ static ::ck_tile::dispatcher::grouped_conv_decl::GroupedConvKernelSetRegistrar \ + CK_GROUPED_CONV_DECL_CAT_(_gconv_kset_reg_, __COUNTER__)( \ + #name, \ + ::ck_tile::dispatcher::grouped_conv_decl::GroupedConvKernelSet() __VA_ARGS__.tag(#name)) + +#define DECL_GROUPED_CONV_KERNEL_ALL(dtype, layout) \ + __extension__ static ::ck_tile::dispatcher::grouped_conv_decl::GroupedConvKernelSetRegistrar \ + CK_GROUPED_CONV_DECL_CAT_(_gconv_kset_reg_, __COUNTER__)( \ + #dtype "_" #layout "_all", \ + ::ck_tile::dispatcher::grouped_conv_decl::GroupedConvKernelSet().add( \ + ::ck_tile::dispatcher::grouped_conv_decl::GroupedConvSignature().dtype(#dtype).layout( \ + #layout), \ + ::ck_tile::dispatcher::grouped_conv_decl::GroupedConvAlgorithm(), \ + "*")) + +#define GROUPED_CONV_KERNEL_SET(name) \ + ::ck_tile::dispatcher::grouped_conv_decl::GroupedConvKernelSet name +#define BEGIN_GROUPED_CONV_KERNEL_SET() \ + ::ck_tile::dispatcher::grouped_conv_decl::GroupedConvKernelSet() diff --git a/dispatcher/include/ck_tile/dispatcher/grouped_conv_problem.hpp b/dispatcher/include/ck_tile/dispatcher/grouped_conv_problem.hpp new file mode 100644 index 0000000000..5b58f37206 --- /dev/null +++ b/dispatcher/include/ck_tile/dispatcher/grouped_conv_problem.hpp @@ -0,0 +1,255 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/** + * @file grouped_conv_problem.hpp + * @brief Grouped Convolution problem definition + */ + +#pragma once + +#include +#include +#include +#include + +namespace ck_tile { +namespace dispatcher { + +/** + * @brief Grouped Convolution operation type + */ +enum class GroupedConvOp +{ + Forward, // Y = Conv(X, W) + BackwardData, // dX = ConvBwdData(dY, W) + BackwardWeight // dW = ConvBwdWeight(X, dY) +}; + +/** + * @brief Grouped Convolution problem specification + */ +struct GroupedConvProblem +{ + // Batch and channels + std::int64_t N; // Batch size + std::int64_t C; // Input channels + std::int64_t K; // Output channels (filters) + std::int64_t G; // Number of groups (1 for standard conv) + + // Spatial dimensions (supports 1D, 2D, 3D) + std::array input_spatial; // {D, H, W} or {1, H, W} for 2D + std::array filter_spatial; // {Z, Y, X} or {1, Y, X} for 2D + std::array output_spatial; // {Do, Ho, Wo} or {1, Ho, Wo} for 2D + + // Convolution parameters + std::array stride; // Stride in each dimension + std::array padding; // Padding in each dimension + std::array dilation; // Dilation in each dimension + + // Operation type + GroupedConvOp op = GroupedConvOp::Forward; + + // Split-K for backward weight (k_batch parameter in CK Tile). + // Values > 1 split the reduction dimension across multiple thread blocks + // and use atomic accumulation. 
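+    // Illustrative example (assumed values, not a tuning recommendation): for a backward-weight
+    // problem whose reduction dimension (batch * output spatial) is small, setting
+    // problem.split_k = 4 launches four partial reductions that are combined with atomic adds.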
+ int split_k = 1; + + // Default constructor for 2D convolution + GroupedConvProblem() + : N(1), + C(64), + K(64), + G(1), + input_spatial{1, 28, 28}, + filter_spatial{1, 3, 3}, + output_spatial{1, 26, 26}, + stride{1, 1, 1}, + padding{0, 0, 0}, + dilation{1, 1, 1}, + op(GroupedConvOp::Forward) + { + } + + // Constructor for 2D convolution + GroupedConvProblem(std::int64_t n, + std::int64_t c, + std::int64_t k, + std::int64_t hi, + std::int64_t wi, + std::int64_t y, + std::int64_t x, + std::int64_t stride_h = 1, + std::int64_t stride_w = 1, + std::int64_t pad_h = 0, + std::int64_t pad_w = 0, + std::int64_t dilation_h = 1, + std::int64_t dilation_w = 1) + : N(n), + C(c), + K(k), + G(1), + input_spatial{1, hi, wi}, + filter_spatial{1, y, x}, + stride{1, stride_h, stride_w}, + padding{0, pad_h, pad_w}, + dilation{1, dilation_h, dilation_w}, + op(GroupedConvOp::Forward) + { + compute_output_size(); + } + + /// Check if problem dimensions are valid + bool is_valid() const + { + return N > 0 && C > 0 && K > 0 && G > 0 && (C % G == 0) && (K % G == 0); + } + + /// Compute output spatial dimensions + void compute_output_size() + { + for(int i = 0; i < 3; ++i) + { + std::int64_t effective_filter = (filter_spatial[i] - 1) * dilation[i] + 1; + output_spatial[i] = + (input_spatial[i] + 2 * padding[i] - effective_filter) / stride[i] + 1; + } + } + + /// Get 2D height/width accessors + std::int64_t Hi() const { return input_spatial[1]; } + std::int64_t Wi() const { return input_spatial[2]; } + std::int64_t Ho() const { return output_spatial[1]; } + std::int64_t Wo() const { return output_spatial[2]; } + std::int64_t Y() const { return filter_spatial[1]; } // Filter height + std::int64_t X() const { return filter_spatial[2]; } // Filter width + + /// Get total FLOPs for this convolution + double get_flops() const + { + // Forward: 2 * N * K * Ho * Wo * C * Y * X / G + double spatial_out = 1.0; + double filter_size = 1.0; + for(int i = 0; i < 3; ++i) + { + spatial_out *= output_spatial[i]; + filter_size *= filter_spatial[i]; + } + return 2.0 * N * K * spatial_out * (C / G) * filter_size; + } + + /// Check if this is a depthwise convolution + bool is_depthwise() const { return G == C && G == K; } + + /// Check if this is a pointwise (1x1) convolution + bool is_pointwise() const + { + return filter_spatial[0] == 1 && filter_spatial[1] == 1 && filter_spatial[2] == 1; + } + + /// String representation + std::string to_string() const + { + std::string s = "GroupedConvProblem(N=" + std::to_string(N); + s += ", C=" + std::to_string(C) + ", K=" + std::to_string(K); + s += ", G=" + std::to_string(G); + s += ", Hi=" + std::to_string(Hi()) + ", Wi=" + std::to_string(Wi()); + s += ", Y=" + std::to_string(Y()) + ", X=" + std::to_string(X()); + s += ", Ho=" + std::to_string(Ho()) + ", Wo=" + std::to_string(Wo()); + s += ")"; + return s; + } +}; + +// ============================================================================= +// GroupedConvProblemBuilder +// ============================================================================= + +/// Builder pattern for Grouped Convolution problem configuration +class GroupedConvProblemBuilder +{ + public: + GroupedConvProblemBuilder() = default; + + GroupedConvProblemBuilder& batch(std::int64_t n) + { + problem_.N = n; + return *this; + } + + GroupedConvProblemBuilder& channels(std::int64_t c, std::int64_t k) + { + problem_.C = c; + problem_.K = k; + return *this; + } + + GroupedConvProblemBuilder& groups(std::int64_t g) + { + problem_.G = g; + return *this; + } + + 
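+    // Usage sketch (illustrative; the concrete sizes are arbitrary, see the setters below
+    // and build() at the end of this class):
+    //     GroupedConvProblem p = GroupedConvProblemBuilder()
+    //                                .batch(16)
+    //                                .channels(/*C=*/64, /*K=*/128)
+    //                                .groups(1)
+    //                                .input_size(56, 56)
+    //                                .filter_size(3, 3)
+    //                                .stride(1, 1)
+    //                                .padding(1, 1)
+    //                                .build();   // computes output size and validates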
GroupedConvProblemBuilder& input_size(std::int64_t h, std::int64_t w) + { + problem_.input_spatial[0] = 1; + problem_.input_spatial[1] = h; + problem_.input_spatial[2] = w; + return *this; + } + + GroupedConvProblemBuilder& filter_size(std::int64_t y, std::int64_t x) + { + problem_.filter_spatial[0] = 1; + problem_.filter_spatial[1] = y; + problem_.filter_spatial[2] = x; + return *this; + } + + GroupedConvProblemBuilder& stride(std::int64_t sh, std::int64_t sw) + { + problem_.stride[0] = 1; + problem_.stride[1] = sh; + problem_.stride[2] = sw; + return *this; + } + + GroupedConvProblemBuilder& padding(std::int64_t ph, std::int64_t pw) + { + problem_.padding[0] = 0; + problem_.padding[1] = ph; + problem_.padding[2] = pw; + return *this; + } + + GroupedConvProblemBuilder& dilation(std::int64_t dh, std::int64_t dw) + { + problem_.dilation[0] = 1; + problem_.dilation[1] = dh; + problem_.dilation[2] = dw; + return *this; + } + + GroupedConvProblemBuilder& operation(GroupedConvOp op) + { + problem_.op = op; + return *this; + } + + [[nodiscard]] GroupedConvProblem build() const + { + GroupedConvProblem p = problem_; + p.compute_output_size(); + if(!p.is_valid()) + { + throw std::invalid_argument("Invalid grouped convolution problem dimensions"); + } + return p; + } + + private: + GroupedConvProblem problem_; +}; + +} // namespace dispatcher +} // namespace ck_tile diff --git a/dispatcher/include/ck_tile/dispatcher/grouped_conv_registry.hpp b/dispatcher/include/ck_tile/dispatcher/grouped_conv_registry.hpp new file mode 100644 index 0000000000..42698a0bc8 --- /dev/null +++ b/dispatcher/include/ck_tile/dispatcher/grouped_conv_registry.hpp @@ -0,0 +1,614 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/** + * @file grouped_conv_registry.hpp + * @brief Grouped Convolution kernel registry and dispatcher + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/dispatcher/base_registry.hpp" +#include "ck_tile/dispatcher/dispatcher_error.hpp" +#include "ck_tile/dispatcher/grouped_conv_problem.hpp" +#include "ck_tile/dispatcher/grouped_conv_kernel_decl.hpp" + +namespace ck_tile { +namespace dispatcher { + +// ============================================================================= +// Thread-local buffer context for GroupedConvDispatcher::run() +// The generated conv backend RunFn reads these to get buffer pointers. 
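+// Flow (descriptive sketch): the buffer-taking GroupedConvDispatcher::run() overload below
+// fills g_conv_dispatch_buffers (pointers, warmup/repeat, benchmarking flag, split_k) and then
+// invokes the selected kernel's RunFn; the RunFn produced by the make_conv_*_run_fn factories
+// reads the same fields back when building the CK Tile host arguments.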
+// ============================================================================= + +struct ConvDispatchBuffers +{ + const void* input_ptr = nullptr; + const void* weight_ptr = nullptr; + void* output_ptr = nullptr; + int warmup = 3; + int repeat = 10; + bool benchmarking = true; + int split_k = 1; +}; + +inline thread_local ConvDispatchBuffers g_conv_dispatch_buffers; + +// ============================================================================= +// GroupedConvKernelKey - Unique identifier for a grouped convolution kernel +// ============================================================================= + +struct GroupedConvKernelKey +{ + // Signature fields + std::string dtype_in; + std::string dtype_wei; + std::string dtype_out; + std::string layout; // e.g., "nhwgc" + int ndim_spatial = 2; // 1, 2, or 3 + GroupedConvOp op = GroupedConvOp::Forward; + + // Tile configuration + int tile_m = 1; + int tile_n = 128; + int tile_k = 128; + + // Wave/warp configuration + int wave_m = 2; + int wave_n = 2; + int wave_k = 1; + int warp_m = 32; + int warp_n = 32; + int warp_k = 16; + + // Pipeline + std::string pipeline = "compv3"; + std::string scheduler = "intrawave"; + std::string epilogue = "cshuffle"; + + // ConvConfigBase parity fields + int vector_size_a = 4; + int vector_size_b = 8; + int vector_size_c = 8; + int block_per_cu = 1; + int num_wave_groups = 1; + int num_groups_to_merge = 1; + + // GPU architecture (for filter_by_arch) + std::string arch = "gfx942"; + + bool operator==(const GroupedConvKernelKey& other) const + { + return dtype_in == other.dtype_in && dtype_wei == other.dtype_wei && + dtype_out == other.dtype_out && layout == other.layout && + ndim_spatial == other.ndim_spatial && op == other.op && tile_m == other.tile_m && + tile_n == other.tile_n && tile_k == other.tile_k && wave_m == other.wave_m && + wave_n == other.wave_n && wave_k == other.wave_k && warp_m == other.warp_m && + warp_n == other.warp_n && warp_k == other.warp_k && pipeline == other.pipeline && + scheduler == other.scheduler && epilogue == other.epilogue && + vector_size_a == other.vector_size_a && vector_size_b == other.vector_size_b && + vector_size_c == other.vector_size_c && block_per_cu == other.block_per_cu && + num_wave_groups == other.num_wave_groups && + num_groups_to_merge == other.num_groups_to_merge && arch == other.arch; + } + + std::string to_string() const + { + std::string op_str; + switch(op) + { + case GroupedConvOp::Forward: op_str = "fwd"; break; + case GroupedConvOp::BackwardData: op_str = "bwd_data"; break; + case GroupedConvOp::BackwardWeight: op_str = "bwd_weight"; break; + } + return "grouped_conv_" + op_str + "_" + dtype_in + "_" + std::to_string(ndim_spatial) + + "d_" + std::to_string(tile_m) + "x" + std::to_string(tile_n) + "x" + + std::to_string(tile_k) + "_" + std::to_string(wave_m) + "x" + + std::to_string(wave_n) + "x" + std::to_string(wave_k) + "_" + + std::to_string(warp_m) + "x" + std::to_string(warp_n) + "x" + + std::to_string(warp_k) + "_" + pipeline; + } +}; + +struct GroupedConvKernelKeyHash +{ + std::size_t operator()(const GroupedConvKernelKey& key) const + { + std::size_t h = std::hash{}(key.dtype_in); + h ^= std::hash{}(key.layout) << 1; + h ^= std::hash{}(key.ndim_spatial) << 2; + h ^= std::hash{}(static_cast(key.op)) << 3; + h ^= std::hash{}(key.tile_m) << 4; + h ^= std::hash{}(key.tile_n) << 5; + h ^= std::hash{}(key.tile_k) << 6; + h ^= std::hash{}(key.wave_m) << 7; + h ^= std::hash{}(key.wave_n) << 8; + h ^= std::hash{}(key.warp_m) << 9; + h ^= 
std::hash{}(key.warp_n) << 10; + h ^= std::hash{}(key.pipeline) << 11; + h ^= std::hash{}(key.arch) << 12; + return h; + } +}; + +// ============================================================================= +// GroupedConvKernelInstance - Runtime representation of a kernel +// ============================================================================= + +// Forward declaration for shared_ptr type alias +class GroupedConvKernelInstance; +using GroupedConvKernelInstancePtr = std::shared_ptr; + +class GroupedConvKernelInstance +{ + public: + using RunFn = std::function; + + GroupedConvKernelInstance(const GroupedConvKernelKey& key, + const std::string& name, + RunFn run_fn) + : key_(key), name_(name), run_fn_(std::move(run_fn)) + { + } + + const GroupedConvKernelKey& key() const { return key_; } + const std::string& name() const { return name_; } + + float run(const GroupedConvProblem& problem, void* stream = nullptr) const + { + return run_fn_(problem, stream); + } + + bool matches(const GroupedConvProblem& problem) const + { + // Check if this kernel can handle the problem + return problem.op == key_.op; + } + + private: + GroupedConvKernelKey key_; + std::string name_; + RunFn run_fn_; +}; + +// ============================================================================= +// GroupedConvRegistry - Stores and manages grouped convolution kernels +// ============================================================================= + +class GroupedConvRegistry : public BaseRegistry +{ + using Base = BaseRegistry; + + public: + GroupedConvRegistry() = default; + + /// Singleton instance for global kernel registration + static GroupedConvRegistry& instance() + { + static GroupedConvRegistry registry; + return registry; + } + + /// Register kernels from a GroupedConvKernelSet (atomic batch registration) + bool register_set(const GroupedConvKernelSet& kernel_set, Priority priority = Priority::Normal) + { + // Build all instances first, then register under a single lock hold + // so readers never see a half-registered set. + std::vector>> + batch; + batch.reserve(kernel_set.declarations().size()); + + for(const auto& decl : kernel_set.declarations()) + { + GroupedConvKernelKey key; + key.dtype_in = decl.signature.dtype_in_; + key.dtype_wei = decl.signature.dtype_wei_; + key.dtype_out = decl.signature.dtype_out_; + key.layout = decl.signature.layout_; + key.ndim_spatial = decl.signature.num_dims_; + key.op = (decl.signature.conv_op_ == "forward") ? GroupedConvOp::Forward + : (decl.signature.conv_op_ == "bwd_data") ? 
GroupedConvOp::BackwardData + : GroupedConvOp::BackwardWeight; + key.tile_m = decl.algorithm.tile_m_; + key.tile_n = decl.algorithm.tile_n_; + key.tile_k = decl.algorithm.tile_k_; + key.wave_m = decl.algorithm.wave_m_; + key.wave_n = decl.algorithm.wave_n_; + key.wave_k = decl.algorithm.wave_k_; + key.warp_m = decl.algorithm.warp_m_; + key.warp_n = decl.algorithm.warp_n_; + key.warp_k = decl.algorithm.warp_k_; + key.pipeline = decl.algorithm.pipeline_; + key.scheduler = decl.algorithm.scheduler_; + key.epilogue = decl.algorithm.epilogue_; + key.vector_size_a = decl.algorithm.vector_a_; + key.vector_size_b = decl.algorithm.vector_b_; + key.vector_size_c = decl.algorithm.vector_c_; + key.block_per_cu = decl.algorithm.block_per_cu_; + key.num_wave_groups = decl.algorithm.num_wave_groups_; + key.num_groups_to_merge = decl.algorithm.num_groups_to_merge_; + key.arch = decl.arch; + + batch.emplace_back(key, + std::make_shared( + key, decl.name(), [](const GroupedConvProblem&, void*) -> float { + return 0.0f; + })); + } + + std::lock_guard lock(mutex()); + bool any_registered = false; + for(auto& [key, instance] : batch) + { + auto it = entries().find(key); + if(it == entries().end() || it->second.priority <= priority) + { + entries_mut()[key] = typename Base::Entry{std::move(instance), priority}; + any_registered = true; + } + } + return any_registered; + } + + /// Find the best kernel for a problem + const GroupedConvKernelInstance* find(const GroupedConvProblem& problem) const + { + std::lock_guard lock(mutex()); + const GroupedConvKernelInstance* best = nullptr; + Priority best_priority = Priority::Low; + + for(const auto& [key, entry] : entries()) + { + if(entry.instance->matches(problem)) + { + if(!best || entry.priority > best_priority) + { + best = entry.instance.get(); + best_priority = entry.priority; + } + } + } + + return best; + } + + /// Get all registered kernels + std::vector all_kernels() const + { + std::lock_guard lock(mutex()); + std::vector result; + for(const auto& [key, entry] : entries()) + { + result.push_back(entry.instance.get()); + } + return result; + } + + /// Export registry to JSON string + std::string export_json(bool include_statistics = false) const + { + // Note: get_name() acquires the mutex internally, so we must NOT hold + // the registry mutex here (std::mutex is not recursive). 
+ std::string reg_name = get_name(); + + std::lock_guard lock(mutex()); + std::ostringstream json; + + json << "{\n"; + json << " \"metadata\": {\n"; + json << " \"registry_name\": \"" << json_escape(reg_name) << "\",\n"; + json << " \"total_kernels\": " << entries().size() << "\n"; + json << " }"; + + if(include_statistics && !entries().empty()) + { + std::map by_datatype; + std::map by_pipeline; + std::map by_arch; + + for(const auto& [key, entry] : entries()) + { + std::string dtype_key = key.dtype_in + "_" + key.dtype_wei + "_" + key.dtype_out; + by_datatype[dtype_key]++; + by_pipeline[key.pipeline]++; + by_arch[key.arch]++; + } + + json << ",\n \"statistics\": {\n"; + json << " \"by_datatype\": {"; + bool first = true; + for(const auto& [dtype, count] : by_datatype) + { + if(!first) + json << ","; + json << "\"" << json_escape(dtype) << "\":" << count; + first = false; + } + json << "},\n"; + json << " \"by_pipeline\": {"; + first = true; + for(const auto& [pipeline, count] : by_pipeline) + { + if(!first) + json << ","; + json << "\"" << json_escape(pipeline) << "\":" << count; + first = false; + } + json << "},\n"; + json << " \"by_arch\": {"; + first = true; + for(const auto& [arch, count] : by_arch) + { + if(!first) + json << ","; + json << "\"" << json_escape(arch) << "\":" << count; + first = false; + } + json << "}\n }"; + } + + json << ",\n \"kernels\": [\n"; + bool first = true; + for(const auto& [key, entry] : entries()) + { + if(!first) + json << ",\n"; + json << " " << export_kernel_json(*entry.instance); + first = false; + } + json << "\n ]\n"; + json << "}\n"; + + return json.str(); + } + + /// Export registry to JSON file + void export_json_to_file(const std::string& filename, bool include_statistics = false) const + { + std::string json_str = export_json(include_statistics); + std::ofstream file(filename); + if(!file.is_open()) + { + throw std::runtime_error("Failed to open file for export: " + filename); + } + file << json_str; + } + + /// Get kernels matching a predicate + std::vector + filter(std::function predicate) const + { + std::lock_guard lock(mutex()); + std::vector result; + for(const auto& [key, entry] : entries()) + { + if(predicate(*entry.instance)) + { + result.push_back(entry.instance.get()); + } + } + return result; + } + + /// Remove kernels not matching the arch + std::size_t filter_by_arch(const std::string& gpu_arch) + { + std::lock_guard lock(mutex()); + std::vector to_remove; + for(const auto& [key, entry] : entries()) + { + if(key.arch != gpu_arch) + { + to_remove.push_back(key); + } + } + for(const auto& key : to_remove) + { + entries_mut().erase(key); + } + return to_remove.size(); + } + + private: + static std::string json_escape(const std::string& str) + { + std::ostringstream oss; + for(char c : str) + { + switch(c) + { + case '"': oss << "\\\""; break; + case '\\': oss << "\\\\"; break; + case '\b': oss << "\\b"; break; + case '\f': oss << "\\f"; break; + case '\n': oss << "\\n"; break; + case '\r': oss << "\\r"; break; + case '\t': oss << "\\t"; break; + default: + if(c < 0x20) + { + oss << "\\u" << std::hex << std::setw(4) << std::setfill('0') << (int)c; + } + else + { + oss << c; + } + } + } + return oss.str(); + } + + static std::string export_kernel_json(const GroupedConvKernelInstance& kernel) + { + std::ostringstream json; + const auto& key = kernel.key(); + + std::string op_str; + switch(key.op) + { + case GroupedConvOp::Forward: op_str = "fwd"; break; + case GroupedConvOp::BackwardData: op_str = "bwd_data"; break; + case 
GroupedConvOp::BackwardWeight: op_str = "bwd_weight"; break; + } + + json << "{\n"; + json << " \"name\": \"" << json_escape(kernel.name()) << "\",\n"; + json << " \"signature\": {\n"; + json << " \"dtype_in\": \"" << json_escape(key.dtype_in) << "\",\n"; + json << " \"dtype_wei\": \"" << json_escape(key.dtype_wei) << "\",\n"; + json << " \"dtype_out\": \"" << json_escape(key.dtype_out) << "\",\n"; + json << " \"layout\": \"" << json_escape(key.layout) << "\",\n"; + json << " \"ndim_spatial\": " << key.ndim_spatial << ",\n"; + json << " \"op\": \"" << op_str << "\"\n"; + json << " },\n"; + json << " \"algorithm\": {\n"; + json << " \"tile_m\": " << key.tile_m << ",\n"; + json << " \"tile_n\": " << key.tile_n << ",\n"; + json << " \"tile_k\": " << key.tile_k << ",\n"; + json << " \"wave\": \"" << key.wave_m << "x" << key.wave_n << "x" << key.wave_k + << "\",\n"; + json << " \"warp\": \"" << key.warp_m << "x" << key.warp_n << "x" << key.warp_k + << "\",\n"; + json << " \"pipeline\": \"" << json_escape(key.pipeline) << "\",\n"; + json << " \"scheduler\": \"" << json_escape(key.scheduler) << "\",\n"; + json << " \"epilogue\": \"" << json_escape(key.epilogue) << "\",\n"; + json << " \"vector_sizes\": [" << key.vector_size_a << "," << key.vector_size_b + << "," << key.vector_size_c << "],\n"; + json << " \"block_per_cu\": " << key.block_per_cu << ",\n"; + json << " \"num_wave_groups\": " << key.num_wave_groups << ",\n"; + json << " \"num_groups_to_merge\": " << key.num_groups_to_merge << "\n"; + json << " },\n"; + json << " \"arch\": \"" << json_escape(key.arch) << "\"\n"; + json << " }"; + + return json.str(); + } +}; + +// ============================================================================= +// GroupedConvDispatcher - Selects and runs the best kernel for a problem +// ============================================================================= + +class GroupedConvDispatcher +{ + public: + enum class SelectionStrategy + { + PriorityBased, + Heuristic + }; + + using HeuristicFunction = std::function(const GroupedConvProblem&)>; + + explicit GroupedConvDispatcher(GroupedConvRegistry* registry) + : registry_(registry), strategy_(SelectionStrategy::PriorityBased) + { + } + + void set_strategy(SelectionStrategy s) { strategy_ = s; } + void set_heuristic(HeuristicFunction fn) { heuristic_ = std::move(fn); } + + /// Select the best kernel for a problem (does not run it) + const GroupedConvKernelInstance* select_kernel(const GroupedConvProblem& problem) const + { + if(strategy_ == SelectionStrategy::Heuristic) + return select_heuristic(problem); + return registry_->find(problem); + } + + /// Run convolution with automatic kernel selection (legacy - no buffers) + float run(const GroupedConvProblem& problem, void* stream = nullptr) + { + const auto* kernel = select_kernel(problem); + if(!kernel) + { + throw NoKernelFound("No suitable grouped convolution kernel found for problem: " + + problem.to_string()); + } + return kernel->run(problem, stream); + } + + /// Run convolution with buffer pointers and automatic kernel selection. + /// Sets the thread-local buffer context before dispatching to the kernel. 
+ float run(const void* input_ptr, + const void* weight_ptr, + void* output_ptr, + const GroupedConvProblem& problem, + void* stream = nullptr, + int warmup = 3, + int repeat = 10) + { + const auto* kernel = select_kernel(problem); + if(!kernel) + { + throw NoKernelFound("No suitable grouped convolution kernel found for problem: " + + problem.to_string()); + } + g_conv_dispatch_buffers.input_ptr = input_ptr; + g_conv_dispatch_buffers.weight_ptr = weight_ptr; + g_conv_dispatch_buffers.output_ptr = output_ptr; + g_conv_dispatch_buffers.warmup = warmup; + g_conv_dispatch_buffers.repeat = repeat; + g_conv_dispatch_buffers.benchmarking = benchmarking_; + g_conv_dispatch_buffers.split_k = problem.split_k; + return kernel->run(problem, stream); + } + + /// Enable or disable GPU benchmarking (timing). + /// When disabled, kernels execute once with no timing overhead. + void set_benchmarking(bool enable) { benchmarking_ = enable; } + [[nodiscard]] bool benchmarking_enabled() const { return benchmarking_; } + + /// Alias kept for backward compatibility + const GroupedConvKernelInstance* select(const GroupedConvProblem& problem) const + { + return select_kernel(problem); + } + + private: + const GroupedConvKernelInstance* select_heuristic(const GroupedConvProblem& problem) const + { + if(!heuristic_) + return registry_->find(problem); + + auto ranked_names = heuristic_(problem); + auto all = registry_->all_kernels(); + for(const auto& name : ranked_names) + { + for(const auto* kernel : all) + { + if(kernel->name().find(name) != std::string::npos && kernel->matches(problem)) + { + return kernel; + } + } + } + return registry_->find(problem); + } + + GroupedConvRegistry* registry_; + SelectionStrategy strategy_; + HeuristicFunction heuristic_; + bool benchmarking_ = true; +}; + +} // namespace dispatcher +} // namespace ck_tile diff --git a/dispatcher/include/ck_tile/dispatcher/grouped_conv_utils.hpp b/dispatcher/include/ck_tile/dispatcher/grouped_conv_utils.hpp new file mode 100644 index 0000000000..c817d36673 --- /dev/null +++ b/dispatcher/include/ck_tile/dispatcher/grouped_conv_utils.hpp @@ -0,0 +1,324 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
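
Before the utility header below, a minimal usage sketch of the registry/dispatcher pair defined above. This is not part of the patch: it assumes a `GroupedConvRegistry` that has already been populated elsewhere, the function name and shapes are illustrative, and the heuristic callback is assumed to return a ranked list of kernel-name substrings, as `select_heuristic()` suggests.

```cpp
// Hedged sketch: wiring a GroupedConvDispatcher to an already-populated registry.
#include "ck_tile/dispatcher_conv.hpp"

#include <string>
#include <vector>

using namespace ck_tile::dispatcher;

float run_conv2d_fwd_example(GroupedConvRegistry& registry,
                             const void* input, const void* weight, void* output)
{
    // N=32, C=64, K=128, 28x28 input, 3x3 filter, stride 1, pad 1 (illustrative shape)
    auto problem = grouped_conv_utils::create_grouped_conv2d_problem(
        32, 64, 128, 28, 28, 3, 3, /*stride=*/1, /*padding=*/1, GroupedConvOp::Forward);

    GroupedConvDispatcher dispatcher(&registry);
    dispatcher.set_benchmarking(true); // time the kernel with the warmup/repeat counts below

    // Optionally rank candidate kernels by (sub)name before the priority-based fallback.
    dispatcher.set_strategy(GroupedConvDispatcher::SelectionStrategy::Heuristic);
    dispatcher.set_heuristic([](const GroupedConvProblem&) {
        return std::vector<std::string>{"compv4", "compv3"};
    });

    // Throws NoKernelFound if nothing in the registry matches the problem.
    return dispatcher.run(input, weight, output, problem, /*stream=*/nullptr,
                          /*warmup=*/3, /*repeat=*/10);
}
```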
+// SPDX-License-Identifier: MIT + +/** + * @file grouped_conv_utils.hpp + * @brief CK Tile Grouped Convolution Dispatcher Utilities + */ + +#pragma once + +#include "ck_tile/dispatcher/grouped_conv_config.hpp" +#include "ck_tile/dispatcher/grouped_conv_kernel_decl.hpp" +#include "ck_tile/dispatcher/grouped_conv_problem.hpp" +#include "ck_tile/dispatcher/grouped_conv_registry.hpp" +#include "ck_tile/dispatcher/arch_filter.hpp" +#include "ck_tile/dispatcher/utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ck_tile { +namespace dispatcher { + +using GroupedConvSig = grouped_conv_decl::GroupedConvSignature; +using GroupedConvAlgo = grouped_conv_decl::GroupedConvAlgorithm; + +namespace grouped_conv_utils { + +inline GroupedConvKernelDecl create_grouped_conv2d_fwd(const std::string& dtype = "fp16", + int tile_n = 128, + int tile_k = 128, + const std::string& arch = "gfx942") +{ + return GroupedConvKernelDecl( + GroupedConvSig().dtype(dtype).layout("nhwc").conv_type("forward").dims(2), + GroupedConvAlgo() + .tile(1, tile_n, tile_k) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv4") + .vector_sizes(4, 8, 8), + arch); +} + +inline GroupedConvKernelDecl create_grouped_conv3d_fwd(const std::string& dtype = "fp16", + int tile_n = 64, + int tile_k = 64, + const std::string& arch = "gfx942") +{ + return GroupedConvKernelDecl( + GroupedConvSig().dtype(dtype).layout("ndhwc").conv_type("forward").dims(3), + GroupedConvAlgo() + .tile(1, tile_n, tile_k) + .wave(2, 2, 1) + .warp(16, 16, 32) + .pipeline("compv3") + .vector_sizes(4, 8, 8), + arch); +} + +inline GroupedConvKernelDecl create_grouped_conv2d_bwd_data(const std::string& dtype = "fp16", + int tile_n = 128, + int tile_k = 128, + const std::string& arch = "gfx942") +{ + return GroupedConvKernelDecl( + GroupedConvSig().dtype(dtype).layout("nhwc").conv_type("bwd_data").dims(2), + GroupedConvAlgo() + .tile(1, tile_n, tile_k) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .vector_sizes(4, 8, 8), + arch); +} + +inline GroupedConvKernelDecl create_grouped_conv2d_bwd_weight(const std::string& dtype = "fp16", + int tile_n = 128, + int tile_k = 128, + const std::string& arch = "gfx942") +{ + return GroupedConvKernelDecl( + GroupedConvSig().dtype(dtype).layout("nhwc").conv_type("bwd_weight").dims(2), + GroupedConvAlgo() + .tile(1, tile_n, tile_k) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .memory_op("atomic_add") + .vector_sizes(4, 8, 8), + arch); +} + +inline GroupedConvProblem create_grouped_conv2d_problem(int N, + int C, + int K, + int Hi, + int Wi, + int Y, + int X, + int stride = 1, + int padding = 0, + GroupedConvOp op = GroupedConvOp::Forward) +{ + GroupedConvProblem p; + p.N = N; + p.C = C; + p.K = K; + p.G = 1; + p.input_spatial = {1, Hi, Wi}; + p.filter_spatial = {1, Y, X}; + p.stride = {1, stride, stride}; + p.padding = {0, padding, padding}; + p.dilation = {1, 1, 1}; + p.op = op; + p.compute_output_size(); + return p; +} + +inline GroupedConvProblem create_grouped_conv3d_problem(int N, + int C, + int K, + int Di, + int Hi, + int Wi, + int Z, + int Y, + int X, + int stride = 1, + int padding = 0, + GroupedConvOp op = GroupedConvOp::Forward) +{ + GroupedConvProblem p; + p.N = N; + p.C = C; + p.K = K; + p.G = 1; + p.input_spatial = {Di, Hi, Wi}; + p.filter_spatial = {Z, Y, X}; + p.stride = {stride, stride, stride}; + p.padding = {padding, padding, padding}; + p.dilation = {1, 1, 1}; + p.op = op; + p.compute_output_size(); + return p; +} + +inline 
GroupedConvProblem create_depthwise_grouped_conv2d_problem( + int N, int C, int Hi, int Wi, int Y, int X, int stride = 1, int padding = 0) +{ + GroupedConvProblem p; + p.N = N; + p.C = C; + p.K = C; + p.G = C; + p.input_spatial = {1, Hi, Wi}; + p.filter_spatial = {1, Y, X}; + p.stride = {1, stride, stride}; + p.padding = {0, padding, padding}; + p.dilation = {1, 1, 1}; + p.op = GroupedConvOp::Forward; + p.compute_output_size(); + return p; +} + +inline void print_pattern_docs(std::ostream& os = std::cout) +{ + os << "Grouped Convolution Pattern Documentation\n"; + os << "==========================================\n"; + os << "Signature patterns: dtype, layout, conv_type (forward/bwd_data/bwd_weight), dims " + "(2/3)\n"; + os << "Algorithm patterns: tile(M,N,K), wave(M,N,K), warp(M,N,K), pipeline, vector_sizes\n"; + os << "Arch patterns: gfx942, gfx90a, gfx950, or '*' for all\n"; +} + +inline void print_grouped_conv_kernel_decl(const GroupedConvKernelDecl& decl, + std::ostream& os = std::cout) +{ + os << "GroupedConvKernelDecl: " << decl.name() << "\n"; + os << " Signature: dtype=" << decl.signature.dtype_in_ << ", layout=" << decl.signature.layout_ + << ", conv_type=" << decl.signature.conv_op_ << ", dims=" << decl.signature.num_dims_ + << "\n"; + os << " Algorithm: tile=" << decl.algorithm.tile_m_ << "x" << decl.algorithm.tile_n_ << "x" + << decl.algorithm.tile_k_ << ", wave=" << decl.algorithm.wave_m_ << "x" + << decl.algorithm.wave_n_ << "x" << decl.algorithm.wave_k_ + << ", warp=" << decl.algorithm.warp_m_ << "x" << decl.algorithm.warp_n_ << "x" + << decl.algorithm.warp_k_ << ", pipeline=" << decl.algorithm.pipeline_ << "\n"; + os << " Arch: " << decl.arch << "\n"; +} + +inline void print_grouped_conv_problem(const GroupedConvProblem& p, std::ostream& os = std::cout) +{ + os << p.to_string() << "\n"; + os << " FLOPs: " << std::scientific << p.get_flops() << "\n"; +} + +inline GroupedConvKernelSet build_grouped_conv2d_fwd_set(const std::string& dtype = "fp16", + const std::string& arch = "gfx942") +{ + GroupedConvKernelSet set; + auto decl1 = create_grouped_conv2d_fwd(dtype, 128, 128, arch); + set.add(decl1.signature, decl1.algorithm, decl1.arch); + auto decl2 = create_grouped_conv2d_fwd(dtype, 256, 256, arch); + set.add(decl2.signature, decl2.algorithm, decl2.arch); + return set; +} + +inline GroupedConvKernelSet build_grouped_conv2d_full_set(const std::string& dtype = "fp16", + const std::string& arch = "gfx942") +{ + GroupedConvKernelSet set; + set.merge(build_grouped_conv2d_fwd_set(dtype, arch)); + auto bwd_data = create_grouped_conv2d_bwd_data(dtype, 128, 128, arch); + set.add(bwd_data.signature, bwd_data.algorithm, bwd_data.arch); + auto bwd_weight = create_grouped_conv2d_bwd_weight(dtype, 128, 128, arch); + set.add(bwd_weight.signature, bwd_weight.algorithm, bwd_weight.arch); + return set; +} + +struct ValidationResult +{ + bool passed = false; + float max_abs_diff = 0.0f; + float max_rel_diff = 0.0f; + float rtol = 1e-3f; + float atol = 1e-3f; + + void print(std::ostream& os = std::cout) const + { + os << "ValidationResult: " << (passed ? 
"PASSED" : "FAILED") << "\n"; + os << " max_abs_diff: " << max_abs_diff << ", max_rel_diff: " << max_rel_diff << "\n"; + os << " rtol: " << rtol << ", atol: " << atol << "\n"; + } +}; + +template +inline ValidationResult validate_buffers( + const T* result, const T* reference, size_t count, float rtol = 1e-3f, float atol = 1e-3f) +{ + ValidationResult vr; + vr.rtol = rtol; + vr.atol = atol; + vr.passed = true; + + for(size_t i = 0; i < count; ++i) + { + float r = static_cast(result[i]); + float ref = static_cast(reference[i]); + float abs_diff = std::abs(r - ref); + float rel_diff = (std::abs(ref) > 1e-10f) ? (abs_diff / std::abs(ref)) : 0.0f; + + vr.max_abs_diff = std::max(vr.max_abs_diff, abs_diff); + vr.max_rel_diff = std::max(vr.max_rel_diff, rel_diff); + + float threshold = atol + rtol * std::abs(ref); + if(abs_diff > threshold) + { + vr.passed = false; + } + } + + return vr; +} + +struct BenchmarkResult +{ + std::string kernel_name; + float time_ms = 0.0f; + float tflops = 0.0f; + int warmup_runs = 0; + int benchmark_runs = 0; + + void print(std::ostream& os = std::cout) const + { + os << "BenchmarkResult: " << kernel_name << "\n"; + os << " time_ms: " << time_ms << ", tflops: " << tflops << "\n"; + os << " warmup_runs: " << warmup_runs << ", benchmark_runs: " << benchmark_runs << "\n"; + } +}; + +inline float calc_tflops(double flops, float time_ms) +{ + return static_cast(flops / (time_ms * 1e9)); +} + +inline double calculate_conv_tflops(const GroupedConvProblem& problem, double time_ms) +{ + return problem.get_flops() / (time_ms * 1e9); +} + +} // namespace grouped_conv_utils + +namespace examples { +inline int basic_grouped_conv_example_main(const std::string& example_name) +{ + std::cout << "=== " << example_name << " ===\n"; + + // Create a grouped convolution problem + auto problem = grouped_conv_utils::create_grouped_conv2d_problem( + 32, 64, 128, 28, 28, 3, 3, 1, 1, GroupedConvOp::Forward); + + grouped_conv_utils::print_grouped_conv_problem(problem); + + // Create and print a kernel declaration + auto decl = grouped_conv_utils::create_grouped_conv2d_fwd("fp16", 128, 128, "gfx942"); + grouped_conv_utils::print_grouped_conv_kernel_decl(decl); + + // Build and print kernel set + auto kernel_set = grouped_conv_utils::build_grouped_conv2d_fwd_set("fp16", "gfx942"); + kernel_set.print(); + + return 0; +} +} // namespace examples + +} // namespace dispatcher +} // namespace ck_tile diff --git a/dispatcher/include/ck_tile/dispatcher/kernel_key.hpp b/dispatcher/include/ck_tile/dispatcher/kernel_key.hpp index f49b3a0d74..f5a93c6d34 100644 --- a/dispatcher/include/ck_tile/dispatcher/kernel_key.hpp +++ b/dispatcher/include/ck_tile/dispatcher/kernel_key.hpp @@ -140,6 +140,11 @@ struct KernelKey bool preshuffle; // Preshuffle (for weight preshuffle variants) bool transpose_c; // TransposeC std::uint8_t num_wave_groups; // NumWaveGroups + + // Padding support flags (kPadM, kPadN, kPadK in generated kernels) + bool pad_m = true; // Support arbitrary M dimensions via padding + bool pad_n = true; // Support arbitrary N dimensions via padding + bool pad_k = true; // Support arbitrary K dimensions via padding } algorithm; std::string gfx_arch; // e.g. 
"gfx942", "gfx90a", "gfx908" @@ -185,7 +190,10 @@ struct KernelKey algorithm.double_buffer, algorithm.preshuffle, algorithm.transpose_c, - algorithm.num_wave_groups); + algorithm.num_wave_groups, + algorithm.pad_m, + algorithm.pad_n, + algorithm.pad_k); } /// Equality comparison @@ -397,8 +405,14 @@ inline std::string KernelKey::encode_identifier() const // Include pipeline, scheduler, epilogue for uniqueness oss << to_string(algorithm.pipeline) << "_"; - oss << to_string(algorithm.scheduler) << "_"; oss << to_string(algorithm.epilogue) << "_"; + oss << to_string(algorithm.scheduler) << "_"; + + // Match tile_engine naming: padding flags (True/False) then persistent flag + oss << (algorithm.pad_m ? "True" : "False") << "_"; + oss << (algorithm.pad_n ? "True" : "False") << "_"; + oss << (algorithm.pad_k ? "True" : "False") << "_"; + oss << (algorithm.persistent ? "True" : "False") << "_"; // Match tile_engine naming: tile_m x tile_n x tile_k _ warp_m x warp_n x warp_k _ // warp_tile_m x warp_tile_n x warp_tile_k @@ -407,9 +421,6 @@ inline std::string KernelKey::encode_identifier() const << unsigned(algorithm.wave_shape.k) << "_" << unsigned(algorithm.warp_tile_shape.m) << "x" << unsigned(algorithm.warp_tile_shape.n) << "x" << unsigned(algorithm.warp_tile_shape.k); - // Add trait flags - oss << "_" << (algorithm.persistent ? "persist" : "nopers"); - if(signature.split_k > 1) oss << "_splitk" << unsigned(signature.split_k); if(!signature.elementwise_op.empty() && signature.elementwise_op != "PassThrough") diff --git a/dispatcher/include/ck_tile/dispatcher/ml_heuristic.hpp b/dispatcher/include/ck_tile/dispatcher/ml_heuristic.hpp new file mode 100644 index 0000000000..359d772735 --- /dev/null +++ b/dispatcher/include/ck_tile/dispatcher/ml_heuristic.hpp @@ -0,0 +1,379 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT +#pragma once +#include "ck_tile/dispatcher/dispatcher.hpp" +#include "ck_tile/dispatcher/kernel_key.hpp" +#include "ck_tile/dispatcher/registry.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +namespace ck_tile { +namespace dispatcher { +extern "C" { +int LGBM_BoosterCreateFromModelfile(const char*, int*, void**); +int LGBM_BoosterPredictForMat( + void*, const void*, int, int, int, int, int, int, int, const char*, int64_t*, double*); +int LGBM_BoosterFree(void*); +} +inline int encode_pipeline(Pipeline p) +{ + switch(p) + { + case Pipeline::CompV3: return 0; + case Pipeline::CompV4: return 1; + case Pipeline::CompV5: return 2; + case Pipeline::Mem: return 3; + case Pipeline::PreShuffleV2: return 4; + default: return 0; + } +} +inline int encode_scheduler(Scheduler s) +{ + switch(s) + { + case Scheduler::Intrawave: return 0; + case Scheduler::Interwave: return 1; + default: return 0; + } +} +inline int encode_epilogue(Epilogue e) +{ + switch(e) + { + case Epilogue::Default: return 0; + case Epilogue::CShuffle: return 1; + default: return 0; + } +} +inline int encode_layout(LayoutTag a, LayoutTag b, LayoutTag c) +{ + bool ra = (a == LayoutTag::RowMajor), rb = (b == LayoutTag::RowMajor); + if(ra && !rb) + return 0; // RCR + if(ra && rb) + return 1; // RRR + if(!ra && rb) + return 2; // CCR + return 3; // CRR +} +inline double dtype_bytes_ml(DataType dt) +{ + switch(dt) + { + case DataType::FP32: return 4; + case DataType::FP16: + case DataType::BF16: return 2; + case DataType::FP8: + case DataType::BF8: + case DataType::INT8: return 1; + case DataType::INT4: return 0.5; + default: return 2; + } +} +struct HardwareProfile +{ + int num_cus = 256, simds_per_cu = 4, shader_engines = 32, max_clock_mhz = 2400, + max_waves_per_cu = 32, wavefront_size = 64, lds_capacity = 65536, l1_cache_kb = 32, + l2_cache_kb = 4096, l3_cache_kb = 262144, num_xcd = 8; + int total_simds() const { return num_cus * simds_per_cu; } +}; + +// CRITICAL: Feature count MUST match feature_spec.json +// Python training uses 72 features - this header MUST extract exactly 72 features in the same order +static constexpr int NUM_FEATURES = 72; + +inline std::array +extract_features(const Problem& prob, const KernelKey& key, const HardwareProfile& hw) +{ + // Problem dimensions + double M = prob.M, N = prob.N, K = prob.K; + double sk = (prob.k_batch > 0 ? 
prob.k_batch : 1); + double bpe = dtype_bytes_ml(key.signature.dtype_a); + + // Log-scale features + double l2M = std::log2(std::max(M, 1.0)); + double l2N = std::log2(std::max(N, 1.0)); + double l2K = std::log2(std::max(K, 1.0)); + double l2MNK = std::log2(std::max(M * N * K, 1.0)); + + // Arithmetic intensity + double mem = (M * K + K * N + M * N) * bpe; + double ai = 2.0 * M * N * K / std::max(mem, 1.0); + + // Aspect ratios + double ar_mn = M / std::max(N, 1.0); + double ar_mk = M / std::max(K, 1.0); + double ar_nk = N / std::max(K, 1.0); + + // Layout encoding + double layout = (double)encode_layout( + key.signature.layout_a, key.signature.layout_b, key.signature.layout_c); + + // Tile dimensions + double tm = key.algorithm.tile_shape.m; + double tn = key.algorithm.tile_shape.n; + double tk = key.algorithm.tile_shape.k; + + // Wave/warp dimensions + double wm = key.algorithm.wave_shape.m; + double wn = key.algorithm.wave_shape.n; + double wk = key.algorithm.wave_shape.k; + + // Warp tile dimensions + double wtm = key.algorithm.warp_tile_shape.m; + double wtn = key.algorithm.warp_tile_shape.n; + double wtk = key.algorithm.warp_tile_shape.k; + + // Algorithm encoding + double pipeline = (double)encode_pipeline(key.algorithm.pipeline); + double scheduler = (double)encode_scheduler(key.algorithm.scheduler); + double epilogue = (double)encode_epilogue(key.algorithm.epilogue); + + // Padding flags - read from KernelKey + double pad_m = key.algorithm.pad_m ? 1.0 : 0.0; + double pad_n = key.algorithm.pad_n ? 1.0 : 0.0; + double pad_k = key.algorithm.pad_k ? 1.0 : 0.0; + + // Persistent kernel flag + double persistent = key.algorithm.persistent ? 1.0 : 0.0; + + // Derived features + double num_warps = wm * wn * wk; + double tile_volume = tm * tn * tk; + double tile_mn = tm * tn; + + // LDS usage estimation + double lest = (tm * tk + tn * tk) * bpe; + double lcap = (key.algorithm.pipeline == Pipeline::CompV4) ? 32768.0 : (double)hw.lds_capacity; + double lds_ratio = lest / std::max(lcap, 1.0); + + // Tile counts + double ntm = std::ceil(M / std::max(tm, 1.0)); + double ntn = std::ceil(N / std::max(tn, 1.0)); + double ntk = std::ceil(K / std::max(tk, 1.0)); + double total_output_tiles = ntm * ntn; + + // Tile efficiency (fractional remainder utilization) + auto ef = [](double d, double t) -> double { + if(t <= 0) + return 1.0; + double r = std::fmod(d, t); + return r > 0 ? r / t : 1.0; + }; + double tile_eff_m = ef(M, tm); + double tile_eff_n = ef(N, tn); + double tile_eff_k = ef(K, tk); + double overall_tile_efficiency = tile_eff_m * tile_eff_n * tile_eff_k; + + // CU utilization + double cu_utilization = total_output_tiles / std::max((double)hw.num_cus, 1.0); + + // P0 FIX: Problem-to-tile ratio features (critical for small problems) + double ratio_M_to_tile_m = M / std::max(tm, 1.0); + double ratio_N_to_tile_n = N / std::max(tn, 1.0); + double ratio_K_to_tile_k = K / std::max(tk, 1.0); + + // Binary features: is problem dimension smaller than tile? + double problem_smaller_than_tile_m = (M < tm) ? 1.0 : 0.0; + double problem_smaller_than_tile_n = (N < tn) ? 1.0 : 0.0; + double problem_smaller_than_tile_k = (K < tk) ? 1.0 : 0.0; + double any_dim_too_small = ((M < tm) || (N < tn) || (K < tk)) ? 1.0 : 0.0; + + // P1 FIX: Padding requirement features + double needs_padding_m = (tm > 0 && std::fmod(M, tm) != 0.0) ? 1.0 : 0.0; + double needs_padding_n = (tn > 0 && std::fmod(N, tn) != 0.0) ? 1.0 : 0.0; + double needs_padding_k = (tk > 0 && std::fmod(K, tk) != 0.0) ? 
1.0 : 0.0; + + // Interaction features: kernel has padding when problem needs it + double has_padding_when_needed_m = (needs_padding_m && pad_m) ? 1.0 : 0.0; + double has_padding_when_needed_n = (needs_padding_n && pad_n) ? 1.0 : 0.0; + double has_padding_when_needed_k = (needs_padding_k && pad_k) ? 1.0 : 0.0; + + // Critical feature: missing required padding (kernel will likely fail) + double missing_required_padding_m = (needs_padding_m && !pad_m) ? 1.0 : 0.0; + double missing_required_padding_n = (needs_padding_n && !pad_n) ? 1.0 : 0.0; + double missing_required_padding_k = (needs_padding_k && !pad_k) ? 1.0 : 0.0; + double missing_any_required_padding = + (missing_required_padding_m || missing_required_padding_n || missing_required_padding_k) + ? 1.0 + : 0.0; + + // Hardware features + double hw_num_cus = (double)hw.num_cus; + double hw_simds_per_cu = (double)hw.simds_per_cu; + double hw_total_simds = (double)hw.total_simds(); + double hw_shader_engines = (double)hw.shader_engines; + double hw_max_clock_mhz = (double)hw.max_clock_mhz; + double hw_max_waves_per_cu = (double)hw.max_waves_per_cu; + double hw_wavefront_size = (double)hw.wavefront_size; + double hw_lds_capacity = (double)hw.lds_capacity; + double hw_l1_cache_kb = (double)hw.l1_cache_kb; + double hw_l2_cache_kb = (double)hw.l2_cache_kb; + double hw_l3_cache_kb = (double)hw.l3_cache_kb; + double hw_num_xcd = (double)hw.num_xcd; + + // Feature vector in EXACT order from feature_spec.json + // This order MUST match Python feature_engine.py::get_feature_names() + return {{ + M, // 0 + N, // 1 + K, // 2 + sk, // 3 (split_k) + l2M, // 4 (log2_M) + l2N, // 5 (log2_N) + l2K, // 6 (log2_K) + l2MNK, // 7 (log2_MNK) + ai, // 8 (arithmetic_intensity) + ar_mn, // 9 (aspect_ratio_mn) + ar_mk, // 10 (aspect_ratio_mk) + ar_nk, // 11 (aspect_ratio_nk) + layout, // 12 (layout) + tm, // 13 (tile_m) + tn, // 14 (tile_n) + tk, // 15 (tile_k) + wm, // 16 (warp_m) + wn, // 17 (warp_n) + wk, // 18 (warp_k) + wtm, // 19 (warp_tile_m) + wtn, // 20 (warp_tile_n) + wtk, // 21 (warp_tile_k) + pipeline, // 22 (pipeline) + scheduler, // 23 (scheduler) + epilogue, // 24 (epilogue) + pad_m, // 25 (pad_m) + pad_n, // 26 (pad_n) + pad_k, // 27 (pad_k) + persistent, // 28 (persistent) + num_warps, // 29 (num_warps) + tile_volume, // 30 (tile_volume) + tile_mn, // 31 (tile_mn) + lest, // 32 (lds_usage_estimate) + lds_ratio, // 33 (lds_usage_ratio) + ntm, // 34 (num_tiles_m) + ntn, // 35 (num_tiles_n) + ntk, // 36 (num_tiles_k) + total_output_tiles, // 37 (total_output_tiles) + tile_eff_m, // 38 (tile_eff_m) + tile_eff_n, // 39 (tile_eff_n) + tile_eff_k, // 40 (tile_eff_k) + overall_tile_efficiency, // 41 (overall_tile_efficiency) + cu_utilization, // 42 (cu_utilization) + ratio_M_to_tile_m, // 43 (ratio_M_to_tile_m) + ratio_N_to_tile_n, // 44 (ratio_N_to_tile_n) + ratio_K_to_tile_k, // 45 (ratio_K_to_tile_k) + problem_smaller_than_tile_m, // 46 (problem_smaller_than_tile_m) + problem_smaller_than_tile_n, // 47 (problem_smaller_than_tile_n) + problem_smaller_than_tile_k, // 48 (problem_smaller_than_tile_k) + any_dim_too_small, // 49 (any_dim_too_small) + needs_padding_m, // 50 (needs_padding_m) + needs_padding_n, // 51 (needs_padding_n) + needs_padding_k, // 52 (needs_padding_k) + has_padding_when_needed_m, // 53 (has_padding_when_needed_m) + has_padding_when_needed_n, // 54 (has_padding_when_needed_n) + has_padding_when_needed_k, // 55 (has_padding_when_needed_k) + missing_required_padding_m, // 56 (missing_required_padding_m) + missing_required_padding_n, // 57 
(missing_required_padding_n) + missing_required_padding_k, // 58 (missing_required_padding_k) + missing_any_required_padding, // 59 (missing_any_required_padding) + hw_num_cus, // 60 (hw_num_cus) + hw_simds_per_cu, // 61 (hw_simds_per_cu) + hw_total_simds, // 62 (hw_total_simds) + hw_shader_engines, // 63 (hw_shader_engines) + hw_max_clock_mhz, // 64 (hw_max_clock_mhz) + hw_max_waves_per_cu, // 65 (hw_max_waves_per_cu) + hw_wavefront_size, // 66 (hw_wavefront_size) + hw_lds_capacity, // 67 (hw_lds_capacity) + hw_l1_cache_kb, // 68 (hw_l1_cache_kb) + hw_l2_cache_kb, // 69 (hw_l2_cache_kb) + hw_l3_cache_kb, // 70 (hw_l3_cache_kb) + hw_num_xcd, // 71 (hw_num_xcd) + }}; +} + +class MLHeuristic +{ + public: + MLHeuristic(const std::string& path, + const Registry* reg, + HardwareProfile hw = {}, + bool log_t = false) + : registry_(reg), hw_(hw), log_t_(log_t) + { + int iters = 0; + if(LGBM_BoosterCreateFromModelfile(path.c_str(), &iters, &b_) != 0 || !b_) + { + std::cerr << "MLHeuristic: Failed to load " << path << std::endl; + + // Check if a compressed .gz version exists + std::string gz_path = path + ".gz"; + std::ifstream gz_check(gz_path); + if(gz_check.good()) + { + std::cerr << "MLHeuristic: Found compressed model at " << gz_path << std::endl; + std::cerr << "MLHeuristic: Please decompress it first:" << std::endl; + std::cerr << " gunzip " << gz_path << std::endl; + } + + b_ = nullptr; + } + else + std::cout << "MLHeuristic: Loaded (" << iters << " iters)" << std::endl; + } + ~MLHeuristic() + { + if(b_) + LGBM_BoosterFree(b_); + } + MLHeuristic(const MLHeuristic&) = delete; + MLHeuristic& operator=(const MLHeuristic&) = delete; + bool is_loaded() const { return b_ != nullptr; } + double predict_tflops(const Problem& prob, const KernelKey& key) const + { + if(!b_) + return 0; + auto f = extract_features(prob, key, hw_); + int64_t ol = 0; + double pred = 0; + if(LGBM_BoosterPredictForMat( + b_, f.data(), 0, 1, NUM_FEATURES, 1, 0, 0, 0, "", &ol, &pred) != 0) + return 0; + return log_t_ ? std::expm1(pred) : pred; + } + std::vector operator()(const Problem& prob) const + { + if(!b_ || !registry_) + return {}; + auto insts = registry_->get_all(); + struct C + { + std::string id; + double t; + }; + std::vector cs; + cs.reserve(insts.size()); + for(auto& i : insts) + { + auto& k = i->get_key(); + cs.push_back({k.encode_identifier(), predict_tflops(prob, k)}); + } + std::sort(cs.begin(), cs.end(), [](auto& a, auto& b) { return a.t > b.t; }); + std::vector r; + r.reserve(cs.size()); + for(auto& c : cs) + r.push_back(std::move(c.id)); + return r; + } + + private: + void* b_ = nullptr; + const Registry* registry_ = nullptr; + HardwareProfile hw_; + bool log_t_ = false; +}; +} // namespace dispatcher +} // namespace ck_tile diff --git a/dispatcher/include/ck_tile/dispatcher/problem.hpp b/dispatcher/include/ck_tile/dispatcher/problem.hpp index 437511d1ba..5bffb56b49 100644 --- a/dispatcher/include/ck_tile/dispatcher/problem.hpp +++ b/dispatcher/include/ck_tile/dispatcher/problem.hpp @@ -98,7 +98,7 @@ struct Problem /** * Create Problem by inferring MNK from tensor shapes. 
* - * For GEMM: C[M,N] = A[M,K] × B[K,N] + * For GEMM: C[M,N] = A[M,K] x B[K,N] * * @param a_shape Shape of matrix A (M x K, or K x M if transposed) * @param b_shape Shape of matrix B (K x N, or N x K if transposed) @@ -113,7 +113,7 @@ struct Problem [[nodiscard]] static Problem from_shapes(TensorShape a_shape, TensorShape b_shape, TensorShape c_shape) { - // For C = A × B: + // For C = A x B: // A: [M, K] (or [K, M] if transposed) // B: [K, N] (or [N, K] if transposed) // C: [M, N] @@ -164,7 +164,7 @@ struct Problem * @throws std::invalid_argument if dimensions are inconsistent * * Example: - * // A[512,256] × B[256,1024] = C[512,1024] + * // A[512,256] x B[256,1024] = C[512,1024] * auto problem = Problem::from_dimensions(512, 256, 256, 1024, 512, 1024); */ [[nodiscard]] static Problem from_dimensions(std::int64_t a_rows, @@ -188,7 +188,7 @@ struct Problem * @throws std::invalid_argument if K dimensions don't match * * Example: - * // A[512,256] × B[256,1024] = C[512,1024] + * // A[512,256] x B[256,1024] = C[512,1024] * auto problem = Problem::from_ab(512, 256, 256, 1024); */ [[nodiscard]] static Problem diff --git a/dispatcher/include/ck_tile/dispatcher/registry.hpp b/dispatcher/include/ck_tile/dispatcher/registry.hpp index 93d1eb9f64..4f34e589ea 100644 --- a/dispatcher/include/ck_tile/dispatcher/registry.hpp +++ b/dispatcher/include/ck_tile/dispatcher/registry.hpp @@ -7,38 +7,20 @@ * Central registry for all available kernel instances with priority-based * ordering and efficient lookup. * - * Features: - * - Thread-safe registration and lookup - * - Priority-based ordering (High, Normal, Low) - * - Lookup by name or KernelKey - * - Filter by problem compatibility - * - Supports both singleton and multiple instance patterns - * - * Usage (Singleton - backward compatible): - * auto& registry = Registry::instance(); - * registry.register_kernel(kernel, Priority::High); - * auto kernel = registry.lookup("kernel_name"); - * - * Usage (Multiple registries): - * Registry fp16_registry; - * Registry bf16_registry; - * fp16_registry.register_kernel(fp16_kernel, Priority::High); - * bf16_registry.register_kernel(bf16_kernel, Priority::High); - * - * Dispatcher fp16_dispatcher(&fp16_registry); - * Dispatcher bf16_dispatcher(&bf16_registry); + * Derives from BaseRegistry for shared logic (thread safety, naming, priority, + * merge) while keeping GEMM-specific APIs (lookup by KernelKey, filter_by_arch, + * JSON export, auto-export). 
* * Status: Production ready, thread-safe */ #pragma once +#include "ck_tile/dispatcher/base_registry.hpp" #include "ck_tile/dispatcher/kernel_instance.hpp" #include "ck_tile/dispatcher/kernel_key.hpp" #include -#include #include -#include #include #include @@ -47,20 +29,16 @@ namespace dispatcher { /// Registry: Central mapping from kernel configurations to executable instances /// Thread-safe kernel registration and lookup -/// Supports both singleton pattern and multiple independent instances -class Registry +/// Derives from BaseRegistry for shared functionality +class Registry : public BaseRegistry { + using Base = BaseRegistry; + public: - /// Priority levels for conflict resolution when multiple kernels have same key - enum class Priority - { - Low = 0, - Normal = 1, - High = 2 - }; + // Re-export Priority from the shared enum for backward compatibility + using Priority = ck_tile::dispatcher::Priority; /// Default constructor - creates an empty registry instance - /// Use this to create independent registries for different kernel sets Registry(); /// Destructor - triggers auto-export if enabled @@ -72,106 +50,51 @@ class Registry /// Move assignment Registry& operator=(Registry&& other) noexcept; - // Prevent copying (registries contain shared_ptrs that shouldn't be duplicated) + // Prevent copying Registry(const Registry&) = delete; Registry& operator=(const Registry&) = delete; /// Register a kernel instance with the registry - /// @param instance Kernel instance to register - /// @param priority Priority level for conflict resolution (default: Normal) - /// @return true if registered successfully, false if duplicate with higher priority exists bool register_kernel(KernelInstancePtr instance, Priority priority = Priority::Normal); /// Lookup a kernel by its string identifier - /// @param identifier Kernel identifier string - /// @return Kernel instance if found, nullptr otherwise [[nodiscard]] KernelInstancePtr lookup(const std::string& identifier) const; /// Lookup a kernel by its KernelKey - /// @param key Kernel configuration key - /// @return Kernel instance if found, nullptr otherwise [[nodiscard]] KernelInstancePtr lookup(const KernelKey& key) const; /// Get all registered kernels - /// @return Vector of all kernel instances [[nodiscard]] std::vector get_all() const; /// Get all kernels matching a predicate - /// @param predicate Function to filter kernels - /// @return Vector of matching kernel instances [[nodiscard]] std::vector filter(std::function predicate) const; - /// Get number of registered kernels - [[nodiscard]] std::size_t size() const; - - /// Check if registry is empty - [[nodiscard]] bool empty() const; - - /// Clear all registered kernels - void clear(); - - /// Get registry name (for logging/debugging) - [[nodiscard]] const std::string& get_name() const; - - /// Set registry name (for logging/debugging) - void set_name(const std::string& name); + // size(), empty(), clear(), get_name(), set_name(), merge_from() inherited from Base /// Export registry to JSON string - /// @param include_statistics Whether to include kernel statistics breakdown - /// @return JSON string with all kernel metadata [[nodiscard]] std::string export_json(bool include_statistics = true) const; /// Export registry to JSON file - /// @param filename Output filename - /// @param include_statistics Whether to include kernel statistics breakdown - /// @return true if export succeeded, false otherwise bool export_json_to_file(const std::string& filename, bool include_statistics = true) 
const; - /// Enable automatic JSON export on kernel registration - /// @param filename Output filename for auto-export - /// @param include_statistics Whether to include statistics in auto-export - /// @param export_on_every_registration If true, exports after every registration (default). - /// If false, only exports on destruction. void enable_auto_export(const std::string& filename, bool include_statistics = true, bool export_on_every_registration = true); - /// Disable automatic JSON export void disable_auto_export(); - /// Check if auto-export is enabled [[nodiscard]] bool is_auto_export_enabled() const; - /// Merge kernels from another registry into this one - /// @param other Registry to merge from - /// @param priority Priority for merged kernels (default: Normal) - /// @return Number of kernels successfully merged - std::size_t merge_from(const Registry& other, Priority priority = Priority::Normal); - /// Filter kernels in-place by architecture - /// @param gpu_arch Target GPU architecture string (e.g., "gfx942") - /// @return Number of kernels removed std::size_t filter_by_arch(const std::string& gpu_arch); - /// Get singleton instance of the global registry (backward compatible) - /// This is the default registry used when no specific registry is provided + /// Get singleton instance static Registry& instance(); private: - struct RegistryEntry - { - KernelInstancePtr instance; - Priority priority; - }; - - /// Perform auto-export if enabled void perform_auto_export(); - mutable std::mutex mutex_; - std::unordered_map kernels_; - std::string name_; - // Auto-export configuration bool auto_export_enabled_ = false; std::string auto_export_filename_; @@ -179,7 +102,7 @@ class Registry bool auto_export_on_every_registration_ = true; }; -/// Shared pointer type for registries (useful for managing lifetime) +/// Shared pointer type for registries using RegistryPtr = std::shared_ptr; /// Create a new registry instance (factory function) diff --git a/dispatcher/include/ck_tile/dispatcher_conv.hpp b/dispatcher/include/ck_tile/dispatcher_conv.hpp new file mode 100644 index 0000000000..46d14f90f3 --- /dev/null +++ b/dispatcher/include/ck_tile/dispatcher_conv.hpp @@ -0,0 +1,18 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/// Grouped Convolution-only dispatcher header -- minimal include for conv operations. + +#pragma once + +// Core (needed by all ops) +#include "ck_tile/dispatcher/base_registry.hpp" +#include "ck_tile/dispatcher/dispatcher_error.hpp" +#include "ck_tile/dispatcher/example_args.hpp" + +// Grouped Convolution +#include "ck_tile/dispatcher/grouped_conv_config.hpp" +#include "ck_tile/dispatcher/grouped_conv_problem.hpp" +#include "ck_tile/dispatcher/grouped_conv_kernel_decl.hpp" +#include "ck_tile/dispatcher/grouped_conv_registry.hpp" +#include "ck_tile/dispatcher/grouped_conv_utils.hpp" diff --git a/dispatcher/include/ck_tile/dispatcher_gemm.hpp b/dispatcher/include/ck_tile/dispatcher_gemm.hpp new file mode 100644 index 0000000000..79317c7399 --- /dev/null +++ b/dispatcher/include/ck_tile/dispatcher_gemm.hpp @@ -0,0 +1,22 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/// GEMM-only dispatcher header -- minimal include for GEMM operations. 
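
Taken together with ml_heuristic.hpp above, this umbrella header is enough to rank registered kernels for a problem. A minimal sketch follows; it is not part of the patch, the model path is an assumption (the production layout in .gitignore suggests a `model_tflops.lgbm` file per op/dtype/arch), and the heuristic is assumed to return kernel identifiers in best-predicted-first order.

```cpp
// Hedged sketch: pruning the registry to one arch and ranking kernels with MLHeuristic.
#include "ck_tile/dispatcher_gemm.hpp"
#include "ck_tile/dispatcher/ml_heuristic.hpp"

#include <iostream>

void rank_kernels_for_problem()
{
    using namespace ck_tile::dispatcher;

    auto& reg = Registry::instance();  // singleton, backward compatible
    reg.filter_by_arch("gfx942");      // drop kernels built for other targets

    // A[512,256] x B[256,1024] = C[512,1024]
    auto problem = Problem::from_ab(512, 256, 256, 1024);

    MLHeuristic heuristic("model_tflops.lgbm", &reg); // path is an assumption
    if(heuristic.is_loaded())
    {
        for(const auto& id : heuristic(problem))       // identifiers, best predicted first
            std::cout << id << "\n";
    }
}
```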
+ +#pragma once + +// Core (needed by all ops) +#include "ck_tile/dispatcher/base_registry.hpp" +#include "ck_tile/dispatcher/dispatcher_error.hpp" +#include "ck_tile/dispatcher/example_args.hpp" + +// GEMM +#include "ck_tile/dispatcher/kernel_key.hpp" +#include "ck_tile/dispatcher/kernel_config.hpp" +#include "ck_tile/dispatcher/kernel_decl.hpp" +#include "ck_tile/dispatcher/kernel_instance.hpp" +#include "ck_tile/dispatcher/problem.hpp" +#include "ck_tile/dispatcher/registry.hpp" +#include "ck_tile/dispatcher/dispatcher.hpp" +#include "ck_tile/dispatcher/json_export.hpp" +#include "ck_tile/dispatcher/utils.hpp" diff --git a/dispatcher/python/CMakeLists.txt b/dispatcher/python/CMakeLists.txt index e57678952e..71634fa926 100644 --- a/dispatcher/python/CMakeLists.txt +++ b/dispatcher/python/CMakeLists.txt @@ -3,7 +3,7 @@ # This directory contains Python utilities for the dispatcher examples. # The main utility file is ctypes_utils.py which is used by GEMM Python examples. -# Conv Python examples use their own conv_utils.py in the examples directory. +# Grouped conv Python examples use grouped_conv_utils.py in this directory. # No build targets needed - these are pure Python utilities. message(STATUS "Python utilities directory configured (no build targets)") diff --git a/dispatcher/python/README.md b/dispatcher/python/README.md index 9286acbf72..edbc7acc9d 100644 --- a/dispatcher/python/README.md +++ b/dispatcher/python/README.md @@ -4,6 +4,19 @@ This directory contains Python utilities used by the dispatcher examples. ## Contents +### Shared Utilities (used by both GEMM and Grouped Conv) + +- `dispatcher_common.py` - Shared dispatcher infrastructure + - Path helpers (`get_dispatcher_root`, `get_build_dir`, etc.) + - `ValidationResultBase` - Structured validation feedback + - `validate_wave_config`, `validate_warp_tile_config`, `validate_trait_combo` + - `auto_correct_wave`, `auto_correct_trait` - Auto-correction helpers + - `Colors` - Cross-platform ANSI color support + - `print_phase`, `print_success`, `print_error`, `print_info` - Phased output + - `cleanup_generated_kernels` - Cleanup helper + +### GEMM Utilities + - `ctypes_utils.py` - Core ctypes utilities for GEMM Python examples - `KernelConfig` - Kernel configuration dataclass - `setup_gemm_dispatcher()` - Setup dispatcher with auto-correction @@ -11,11 +24,15 @@ This directory contains Python utilities used by the dispatcher examples. 
- `GemmRunner` - GPU execution helper - Auto-correction and validation utilities -- `conv_utils.py` - Core utilities for Conv Python examples - - `ConvSignature`, `ConvAlgorithm` - Convolution configuration - - `ConvProblem` - Problem definition - - `GpuConvRunner` - GPU execution helper - - `EnhancedConvCodegenRunner` - Kernel codegen utilities +### Grouped Convolution Utilities + +- `grouped_conv_utils.py` - Utilities for grouped convolution + - `GroupedConvValidationResult` - Validation result (extends `ValidationResultBase`) + - `validate_grouped_conv_config` - Validate a grouped conv config + - `auto_correct_grouped_conv_config` - Auto-correct invalid configs + - `get_grouped_conv_default_config` - Get default config for a variant + - `GroupedConvDataType` - Data type enum (FP16, BF16, FP32, FP8, BF8, INT8) + - `format_grouped_conv_summary` - Human-readable config summary ## Usage @@ -36,21 +53,26 @@ from ctypes_utils import ( ) ``` -### Conv Examples - -The Conv Python examples in `dispatcher/examples/conv/python/` import: +### Grouped Conv Usage ```python import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python")) -from conv_utils import ( - ConvSignature, - ConvAlgorithm, - ConvProblem, - GpuConvRunner, +from grouped_conv_utils import ( + validate_grouped_conv_config, + auto_correct_grouped_conv_config, + get_grouped_conv_default_config, + GroupedConvDataType, ) + +# Get a default config +config = get_grouped_conv_default_config(variant="forward", arch="gfx942") + +# Validate +result = validate_grouped_conv_config(config) +print(f"Valid: {result.is_valid}") ``` ## Requirements diff --git a/dispatcher/python/ctypes_utils.py b/dispatcher/python/ctypes_utils.py index 821fc2b08d..c11aaca835 100644 --- a/dispatcher/python/ctypes_utils.py +++ b/dispatcher/python/ctypes_utils.py @@ -37,6 +37,43 @@ import multiprocessing import time +# ============================================================================= +# GPU Architecture Auto-Detection +# ============================================================================= + +_detected_arch: Optional[str] = None + + +def detect_gpu_arch(fallback: str = "gfx942") -> str: + """ + Auto-detect the GPU architecture by querying rocminfo. + + Caches the result after the first call. Falls back to `fallback` if + detection fails (e.g. no GPU, rocminfo not installed). + """ + global _detected_arch + if _detected_arch is not None: + return _detected_arch + + try: + result = subprocess.run( + ["/opt/rocm/bin/rocminfo"], capture_output=True, text=True, timeout=10 + ) + for line in result.stdout.splitlines(): + stripped = line.strip() + if stripped.startswith("Name:") and "gfx" in stripped: + # Extract e.g. 
"gfx950" from "Name: gfx950" + name = stripped.split(":", 1)[1].strip() + if name.startswith("gfx") and name[3:].isdigit(): + _detected_arch = name + return _detected_arch + except Exception: + pass + + _detected_arch = fallback + return _detected_arch + + # ============================================================================= # Path Configuration # ============================================================================= @@ -159,9 +196,9 @@ class ValidationResult: def print_result(self, indent: str = " "): """Print validation result.""" if self.is_valid: - print(f"{indent}✓ Configuration valid") + print(f"{indent}OK Configuration valid") else: - print(f"{indent}⚠ Configuration has issues:") + print(f"{indent}WARNING Configuration has issues:") for err in self.errors: print(f"{indent} - {err}") @@ -300,7 +337,7 @@ def auto_correct_kernel_config( # Check each fix and describe what changed if "scheduler" in fixes and fixes["scheduler"] != config.scheduler: corrections.append( - f"Scheduler: {config.scheduler} → {fixes['scheduler']} " + f"Scheduler: {config.scheduler} -> {fixes['scheduler']} " f"('{config.scheduler}' not supported with pipeline={config.pipeline}, epilogue={config.epilogue})" ) @@ -309,7 +346,7 @@ def auto_correct_kernel_config( new_wave = f"[{fixes.get('wave_m', config.wave_m)}, {fixes.get('wave_n', config.wave_n)}, {fixes.get('wave_k', config.wave_k)}]" if old_wave != new_wave: corrections.append( - f"Wave config: {old_wave} → {new_wave} " + f"Wave config: {old_wave} -> {new_wave} " f"(original not supported on {config.gfx_arch})" ) @@ -318,7 +355,7 @@ def auto_correct_kernel_config( new_warp = f"[{fixes.get('warp_m', config.warp_m)}, {fixes.get('warp_n', config.warp_n)}, {fixes.get('warp_k', config.warp_k)}]" if old_warp != new_warp: corrections.append( - f"Warp tile: {old_warp} → {new_warp} " + f"Warp tile: {old_warp} -> {new_warp} " f"(original not supported for {config.dtype_a} on {config.gfx_arch})" ) @@ -386,13 +423,13 @@ def print_auto_correction( indent: Indentation for output """ if not corrections: - print(f"{indent}✓ Configuration valid - no corrections needed") + print(f"{indent}OK Configuration valid - no corrections needed") return - print(f"\n{indent}⚠ AUTO-CORRECTION APPLIED:") + print(f"\n{indent}WARNING AUTO-CORRECTION APPLIED:") print(f"{indent}" + "-" * 50) for correction in corrections: - print(f"{indent} • {correction}") + print(f"{indent} - {correction}") print(f"{indent}" + "-" * 50) print() @@ -976,6 +1013,226 @@ def _run_codegen_subprocess(args: Dict[str, Any]) -> CodegenResult: ) +def _run_hipcc_subprocess(args: dict) -> Tuple[bool, Optional[Path], str]: + """Module-level function to run hipcc compilation in parallel.""" + import subprocess + from pathlib import Path + + compile_cmd = args["compile_cmd"] + link_cmd = args["link_cmd"] + lib_path = Path(args["lib_path"]) + + try: + res_c = subprocess.run(compile_cmd, capture_output=True, text=True, timeout=300) + if res_c.returncode != 0: + return False, None, f"Compile failed: {res_c.stderr[:200]}" + + res_l = subprocess.run(link_cmd, capture_output=True, text=True, timeout=300) + if res_l.returncode != 0: + return False, None, f"Link failed: {res_l.stderr[:200]}" + + return True, lib_path, "" + except subprocess.TimeoutExpired: + return False, None, "Timeout" + except Exception as e: + return False, None, str(e) + + +def _generate_single_kernel_subprocess(args: dict) -> Tuple[bool, Optional[str], str]: + """Module-level function: generate ONE kernel .hpp via --config JSON file. 
+ + Used by setup_multiple_gemm_dispatchers for per-config parallel codegen. + Returns (success, header_path_or_None, error_msg). + """ + import subprocess + import json + import tempfile + import os + from pathlib import Path + + try: + out_dir = Path(args["output_dir"]) + out_dir.mkdir(parents=True, exist_ok=True) + + # Write the single-config JSON to a temp file + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(args["tile_config_json"], f) + config_file = f.name + + cmd = [ + args["python"], + str(args["codegen_script"]), + "--output-dir", + str(out_dir), + "--datatype", + args["dtype"], + "--layout", + args["layout"], + "--gpu-target", + args["gpu_target"], + "--config", + config_file, + "--variants", + "standard", + ] + + res = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + os.unlink(config_file) + + if res.returncode != 0: + return False, None, f"Codegen failed: {res.stderr[:200]}" + + # Find the generated .hpp using the expected name pattern + pattern = args["hpp_glob_pattern"] + matches = sorted(out_dir.glob(pattern)) + if matches: + return True, str(matches[0]), "" + else: + return False, None, f"No .hpp matching {pattern} after codegen" + + except Exception as e: + return False, None, str(e) + + +def _parse_triplet(text: str) -> Optional[Tuple[int, int, int]]: + parts = text.split("x") + if len(parts) != 3: + return None + try: + return (int(parts[0]), int(parts[1]), int(parts[2])) + except ValueError: + return None + + +def _parse_gemm_header_metadata(header: Path) -> Optional[Dict[str, Any]]: + """ + Parse GEMM header name into configuration metadata. + + Expected stem format: + gemm_{dtype}_{layout}_{pipeline}_{epilogue}_{scheduler} + _{pad_m}_{pad_n}_{pad_k}_{persistent} + _{tile_m}x{tile_n}x{tile_k}_{wave_m}x{wave_n}x{wave_k}_{warp_m}x{warp_n}x{warp_k} + """ + parts = header.stem.split("_") + if len(parts) < 13 or parts[0] != "gemm": + return None + + tile = _parse_triplet(parts[10]) + wave = _parse_triplet(parts[11]) + warp = _parse_triplet(parts[12]) + if tile is None or wave is None or warp is None: + return None + + def _as_bool(v: str) -> bool: + return v.lower() == "true" + + return { + "dtype": parts[1], + "layout": parts[2], + "pipeline": parts[3], + "epilogue": parts[4], + "scheduler": parts[5], + "pad_m": _as_bool(parts[6]), + "pad_n": _as_bool(parts[7]), + "pad_k": _as_bool(parts[8]), + "persistent": _as_bool(parts[9]), + "tile": tile, + "wave": wave, + "warp": warp, + } + + +def _generate_arch_valid_gemm_headers( + python_exe: str, + codegen_script: Path, + output_dir: Path, + dtype: str, + layout: str, + gpu_target: str, + variant: str = "standard", +) -> Tuple[bool, List[Path], str]: + """Generate (or reuse) an arch-filtered kernel catalog for fallback selection.""" + output_dir.mkdir(parents=True, exist_ok=True) + pattern = f"gemm_{dtype}_{layout}_*.hpp" + existing = sorted(output_dir.glob(pattern)) + if existing: + return True, existing, "" + + cmd = [ + python_exe, + str(codegen_script), + "--output-dir", + str(output_dir), + "--datatype", + dtype, + "--layout", + layout, + "--gpu-target", + gpu_target, + "--variants", + variant, + ] + res = subprocess.run(cmd, capture_output=True, text=True, timeout=600) + if res.returncode != 0: + err = (res.stderr or res.stdout or "").strip()[:500] + return False, [], f"Catalog codegen failed: {err}" + + generated = sorted(output_dir.glob(pattern)) + if not generated: + return False, [], "Catalog codegen produced no GEMM headers" + return True, generated, "" + 
+ +def _select_best_arch_valid_gemm_header( + config: "KernelConfig", + headers: List[Path], +) -> Tuple[Optional[Path], Optional[Dict[str, Any]]]: + """Choose nearest arch-valid header for a requested GEMM config.""" + best: Optional[Path] = None + best_meta: Optional[Dict[str, Any]] = None + best_score: Optional[Tuple[int, int, int, int, int, int]] = None + + for h in headers: + meta = _parse_gemm_header_metadata(h) + if meta is None: + continue + if meta["dtype"] != config.dtype_a or meta["layout"] != config.layout: + continue + + tile = meta["tile"] + wave = meta["wave"] + warp = meta["warp"] + tile_delta = ( + abs(tile[0] - config.tile_m) + + abs(tile[1] - config.tile_n) + + abs(tile[2] - config.tile_k) + ) + wave_delta = ( + abs(wave[0] - config.wave_m) + + abs(wave[1] - config.wave_n) + + abs(wave[2] - config.wave_k) + ) + warp_delta = ( + abs(warp[0] - config.warp_m) + + abs(warp[1] - config.warp_n) + + abs(warp[2] - config.warp_k) + ) + score = ( + 0 if meta["pipeline"] == config.pipeline else 1, + 0 if meta["scheduler"] == config.scheduler else 1, + 0 if meta["epilogue"] == config.epilogue else 1, + tile_delta, + wave_delta, + warp_delta, + ) + if best_score is None or score < best_score: + best_score = score + best = h + best_meta = meta + + return best, best_meta + + # ============================================================================= # Preshuffle Utilities # ============================================================================= @@ -1319,7 +1576,7 @@ class CodegenRunner: result = future.result() results.append(result) if verbose: - status = "✓" if result.success else "✗" + status = "OK" if result.success else "FAIL" print( f" {status} {variant}: {result.kernel_count} kernels in {result.elapsed_seconds:.2f}s" ) @@ -1337,7 +1594,7 @@ class CodegenRunner: ) ) if verbose: - print(f" ✗ {variant}: FAILED - {e}") + print(f" FAIL {variant}: FAILED - {e}") total_time = time.time() - start_total if verbose: @@ -1399,7 +1656,7 @@ class CodegenRunner: result = future.result() results.append(result) if verbose: - status = "✓" if result.success else "✗" + status = "OK" if result.success else "FAIL" print( f" {status} {tile_str}: {result.kernel_count} kernels in {result.elapsed_seconds:.2f}s" ) @@ -1417,7 +1674,7 @@ class CodegenRunner: ) ) if verbose: - print(f" ✗ {tile_str}: FAILED - {e}") + print(f" FAIL {tile_str}: FAILED - {e}") total_time = time.time() - start_total if verbose: @@ -1481,7 +1738,7 @@ class CodegenRunner: result = future.result() results.append(result) if verbose: - status = "✓" if result.success else "✗" + status = "OK" if result.success else "FAIL" print( f" {status} {variant}: {result.kernel_count} kernels in {result.elapsed_seconds:.2f}s" ) @@ -1499,7 +1756,7 @@ class CodegenRunner: ) ) if verbose: - print(f" ✗ {variant}: FAILED - {e}") + print(f" FAIL {variant}: FAILED - {e}") total_time = time.time() - start_total if verbose: @@ -1767,7 +2024,7 @@ class CodegenRunner: link_cmd, capture_output=True, text=True, timeout=300 ) if result.returncode == 0: - print(f" ✓ Library rebuilt: {lib_path.name}") + print(f" OK Library rebuilt: {lib_path.name}") # Clean up object file obj_file.unlink(missing_ok=True) return lib_path @@ -1781,6 +2038,105 @@ class CodegenRunner: print(f" Build error: {e}") return None + def build_libraries_parallel( + self, configs_and_headers: List[Tuple[KernelConfig, Path]], verbose: bool = True + ) -> List[Optional[Path]]: + """ + Build multiple libraries in parallel using ProcessPoolExecutor. 
+ Returns a list of library paths (or None if a build failed) in the same order. + """ + import time + from concurrent.futures import ProcessPoolExecutor, as_completed + + start_time = time.time() + build_dir = get_build_dir() + root = get_dispatcher_root() + ck_root = root.parent + ctypes_source = root / "bindings/ctypes/gemm_ctypes_lib.cpp" + static_lib = build_dir / "libck_tile_dispatcher.a" + + if not ctypes_source.exists() or not static_lib.exists(): + if verbose: + print(" Required source or static library missing for parallel build.") + return [None] * len(configs_and_headers) + + args_list = [] + for config, kernel_header in configs_and_headers: + lib_name = f"libdispatcher_gemm_{config.dtype_a}_{config.layout}_{config.tile_str}_{config.pipeline}.so" + lib_path = build_dir / "examples" / lib_name + obj_file = lib_path.with_suffix(".o") + + compile_cmd = [ + "/opt/rocm/bin/hipcc", + "-c", + "-fPIC", + "-O3", + f"-I{root / 'include'}", + f"-I{ck_root / 'include'}", + f"-I{ck_root}", + f"-I{root / 'build/generated_kernels'}", + "-DCK_TILE_SINGLE_KERNEL_INCLUDE", + f"-include{kernel_header}", + "-D__HIP_PLATFORM_AMD__", + f"--offload-arch={config.gfx_arch}", + f'-DGFX_ARCH="{config.gfx_arch}"', + "-mllvm", + "-enable-noalias-to-md-conversion=0", + "-Wno-undefined-func-template", + "-Wno-float-equal", + str(ctypes_source), + "-o", + str(obj_file), + ] + + link_cmd = [ + "/opt/rocm/bin/hipcc", + "-shared", + "-fPIC", + f"--offload-arch={config.gfx_arch}", + "--hip-link", + str(obj_file), + str(static_lib), + "-o", + str(lib_path), + ] + + args_list.append( + { + "compile_cmd": compile_cmd, + "link_cmd": link_cmd, + "lib_path": str(lib_path), + "config_name": f"{config.dtype_a}_{config.layout}_{config.tile_str}", + } + ) + + if verbose: + print( + f"Building {len(args_list)} libraries in parallel (workers={self.max_workers})..." + ) + + results_map = {} + with ProcessPoolExecutor(max_workers=self.max_workers) as executor: + futures = { + executor.submit(_run_hipcc_subprocess, args): i + for i, args in enumerate(args_list) + } + for future in as_completed(futures): + idx = futures[future] + success, lib_path, err = future.result() + results_map[idx] = Path(lib_path) if success else None + if verbose: + status = "OK" if success else f"FAIL ({err})" + print( + f" {status} {Path(lib_path).name if success else args_list[idx]['config_name']}" + ) + + if verbose: + elapsed = time.time() - start_time + print(f"Parallel build finished in {elapsed:.2f}s") + + return [results_map[i] for i in range(len(configs_and_headers))] + def generate_preselected( self, preset: str = "fp16_rcr_essential", output_dir: Optional[Path] = None ) -> CodegenResult: @@ -1933,6 +2289,28 @@ class Registry: """Bind to a loaded dispatcher library.""" self._lib = lib + def build( + self, + verbose: bool = False, + max_workers: Optional[int] = None, + ) -> List["GemmSetupResult"]: + """Parallel JIT compile all kernels in this registry. + + Args: + verbose: Print progress during build. + max_workers: Max parallel codegen/compile processes (default: cpu_count capped at 8). + + Returns a GemmSetupResult per registered kernel (same order as get_kernels()). 
+ """ + if not self._kernels: + return [] + return setup_multiple_gemm_dispatchers( + self._kernels, + registry_name=self._name, + verbose=verbose, + max_workers=max_workers, + ) + def __repr__(self) -> str: return f"Registry(name='{self._name}', kernels={self.kernel_count})" @@ -2109,7 +2487,7 @@ def setup_gemm_dispatcher( log(" Validating config...") validation = validate_kernel_config(config) if not validation.is_valid: - log(" ⚠ Auto-correcting configuration...") + log(" WARNING Auto-correcting configuration...") config, was_modified, corrections = auto_correct_kernel_config( config, verbose=verbose ) @@ -2128,13 +2506,13 @@ def setup_gemm_dispatcher( codegen_result = codegen.generate_from_config(config) if not codegen_result.success: - log(" ⚠ Kernel generation: using existing") + log(" WARNING Kernel generation: using existing") # Step 3: Find matching kernel header kernel_header = find_matching_kernel_header(config) result.kernel_header = kernel_header if not kernel_header: - log(" ⚠ No matching kernel header found") + log(" WARNING No matching kernel header found") # Step 4: Load library log(" Loading library...") @@ -2188,11 +2566,11 @@ def setup_gemm_dispatcher( result.error = "Failed to load rebuilt library" return result result.lib = lib - log(f" ✓ Rebuilt library: {lib.get_kernel_name()}") + log(f" OK Rebuilt library: {lib.get_kernel_name()}") else: - log(" ⚠ Rebuild failed, using existing library") + log(" WARNING Rebuild failed, using existing library") else: - log(" ⚠ No kernel header found for config, using existing library") + log(" WARNING No kernel header found for config, using existing library") # Step 5: Create registry and dispatcher log(" Creating registry and dispatcher...") @@ -2203,12 +2581,305 @@ def setup_gemm_dispatcher( dispatcher = Dispatcher(registry=registry, lib=lib) result.dispatcher = dispatcher - log(f" ✓ Ready: {lib.get_kernel_name()}") + log(f" OK Ready: {lib.get_kernel_name()}") result.success = True return result +def setup_multiple_gemm_dispatchers( + configs: List[KernelConfig], + registry_name: str = "gemm_registry", + verbose: bool = True, + max_workers: Optional[int] = None, +) -> List[GemmSetupResult]: + """ + Setup multiple GEMM dispatchers in parallel. + + Pipeline: + 1. Validate + auto-correct each config + 2. Parallel codegen: generate .hpp for each config via --config JSON + 3. Parallel hipcc: compile each .hpp -> .so + 4. Load + wire up each .so into a GemmSetupResult + + Each config gets its own .so, so different tile sizes can coexist. + + Args: + max_workers: Max parallel processes for codegen/compile (default: cpu_count capped at 8). 
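# Illustrative sketch, not part of the patch: calling the parallel setup directly
# and reporting per-config failures; `configs` is an assumed list of KernelConfig.
results = setup_multiple_gemm_dispatchers(configs, registry_name="bench",
                                          verbose=False, max_workers=8)
for cfg, res in zip(configs, results):
    status = "ready" if res.success else f"failed: {res.error}"
    print(f"{cfg.dtype_a} {cfg.layout} {cfg.tile_str}: {status}")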
+ """ + import sys + + results = [GemmSetupResult(success=False, config=c) for c in configs] + max_workers = max_workers or min(multiprocessing.cpu_count(), 8) + + # -- Step 1: Validate & correct --------------------------------------- + valid_configs = [] + for i, c in enumerate(configs): + val = validate_kernel_config(c) + if not val.is_valid: + c, modified, corrections = auto_correct_kernel_config(c, verbose=False) + results[i].config = c + results[i].corrections = corrections + valid_configs.append(c) + + # -- Step 2: Parallel codegen (one --config JSON per config) ---------- + codegen_script = get_codegen_path() + output_dir = get_generated_kernels_dir() + + codegen_args = [] + for c in valid_configs: + tile_str = c.tile_str + wave_str = f"{c.wave_m}x{c.wave_n}x{c.wave_k}" + warp_str = f"{c.warp_m}x{c.warp_n}x{c.warp_k}" + + tile_config_json = { + "tile_config": { + "tile_m": [c.tile_m], + "tile_n": [c.tile_n], + "tile_k": [c.tile_k], + "warp_m": [c.wave_m], + "warp_n": [c.wave_n], + "warp_k": [c.wave_k], + "warp_tile_m": [c.warp_m], + "warp_tile_n": [c.warp_n], + "warp_tile_k": [c.warp_k], + }, + "trait_config": { + "pipeline": [c.pipeline], + "epilogue": [c.epilogue], + "scheduler": [c.scheduler], + "pad_m": [c.pad_m], + "pad_n": [c.pad_n], + "pad_k": [c.pad_k], + "persistent": [False], + }, + } + + hpp_pattern = ( + f"gemm_{c.dtype_a}_{c.layout}_{c.pipeline}_{c.epilogue}_{c.scheduler}" + f"_*_{tile_str}_{wave_str}_{warp_str}.hpp" + ) + + codegen_args.append( + { + "python": sys.executable, + "codegen_script": str(codegen_script), + "output_dir": str(output_dir), + "dtype": c.dtype_a, + "layout": c.layout, + "gpu_target": c.gfx_arch, + "tile_config_json": tile_config_json, + "hpp_glob_pattern": hpp_pattern, + } + ) + + if verbose: + print( + f"Generating {len(codegen_args)} kernel headers in parallel (workers={max_workers})..." + ) + + headers: List[Optional[Path]] = [None] * len(valid_configs) + with ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit(_generate_single_kernel_subprocess, a): i + for i, a in enumerate(codegen_args) + } + for future in as_completed(futures): + idx = futures[future] + ok, hdr_str, err = future.result() + if ok and hdr_str: + headers[idx] = Path(hdr_str) + results[idx].kernel_header = Path(hdr_str) + if verbose: + print( + f" OK [{idx}] {valid_configs[idx].tile_str}: {Path(hdr_str).name}" + ) + else: + results[idx].error = f"Codegen: {err}" + if verbose: + print(f" FAIL [{idx}] {valid_configs[idx].tile_str}: {err}") + + # For configs rejected by arch filter, map to nearest arch-valid header. + fallback_needed = [i for i, h in enumerate(headers) if h is None] + if fallback_needed: + if verbose: + print( + f"Resolving {len(fallback_needed)} configs via arch-valid GEMM catalog..." 
+ ) + + catalog_cache: Dict[Tuple[str, str, str, str], List[Path]] = {} + for i in fallback_needed: + c = valid_configs[i] + key = (c.gfx_arch, c.dtype_a, c.layout, c.variant) + if key not in catalog_cache: + catalog_dir = ( + output_dir + / "_arch_valid_catalog" + / (f"{c.gfx_arch}_{c.dtype_a}_{c.layout}_{c.variant}") + ) + ok, catalog_headers, err = _generate_arch_valid_gemm_headers( + python_exe=sys.executable, + codegen_script=codegen_script, + output_dir=catalog_dir, + dtype=c.dtype_a, + layout=c.layout, + gpu_target=c.gfx_arch, + variant=c.variant, + ) + if not ok: + catalog_headers = [] + if verbose: + print(f" FAIL [{i}] catalog generation: {err}") + catalog_cache[key] = catalog_headers + + chosen, meta = _select_best_arch_valid_gemm_header(c, catalog_cache[key]) + if chosen is None or meta is None: + continue + + headers[i] = chosen + results[i].kernel_header = chosen + results[i].error = "" + + # Keep Python-side config aligned with the selected kernel header. + valid_configs[i].pipeline = str(meta["pipeline"]) + valid_configs[i].epilogue = str(meta["epilogue"]) + valid_configs[i].scheduler = str(meta["scheduler"]) + valid_configs[i].pad_m = bool(meta["pad_m"]) + valid_configs[i].pad_n = bool(meta["pad_n"]) + valid_configs[i].pad_k = bool(meta["pad_k"]) + valid_configs[i].tile_m = int(meta["tile"][0]) + valid_configs[i].tile_n = int(meta["tile"][1]) + valid_configs[i].tile_k = int(meta["tile"][2]) + valid_configs[i].wave_m = int(meta["wave"][0]) + valid_configs[i].wave_n = int(meta["wave"][1]) + valid_configs[i].wave_k = int(meta["wave"][2]) + valid_configs[i].warp_m = int(meta["warp"][0]) + valid_configs[i].warp_n = int(meta["warp"][1]) + valid_configs[i].warp_k = int(meta["warp"][2]) + results[i].config = valid_configs[i] + + if verbose: + print(f" INFO [{i}] mapped to arch-valid header: {chosen.name}") + + # -- Step 3: Parallel hipcc compilation ------------------------------- + root = get_dispatcher_root() + ck_root = root.parent + build_dir = get_build_dir() + ctypes_source = root / "bindings" / "ctypes" / "gemm_ctypes_lib.cpp" + static_lib = build_dir / "libck_tile_dispatcher.a" + + if not ctypes_source.exists() or not static_lib.exists(): + for i in range(len(valid_configs)): + if results[i].error == "": + results[ + i + ].error = "Missing ctypes source or static library for compilation" + return results + + compile_jobs = [] + compile_index_map = {} + for i, c in enumerate(valid_configs): + hdr = headers[i] + if hdr is None: + continue + + lib_name = ( + f"libdispatcher_gemm_{c.dtype_a}_{c.layout}_{c.tile_str}_{c.pipeline}.so" + ) + lib_path = build_dir / "examples" / lib_name + obj_file = lib_path.with_suffix(".o") + + compile_cmd = [ + "/opt/rocm/bin/hipcc", + "-c", + "-fPIC", + "-O3", + f"-I{root / 'include'}", + f"-I{ck_root / 'include'}", + f"-I{ck_root}", + f"-I{str(output_dir)}", + "-DCK_TILE_SINGLE_KERNEL_INCLUDE", + f"-include{hdr}", + "-D__HIP_PLATFORM_AMD__", + f"--offload-arch={c.gfx_arch}", + f'-DGFX_ARCH="{c.gfx_arch}"', + "-mllvm", + "-enable-noalias-to-md-conversion=0", + "-Wno-undefined-func-template", + "-Wno-float-equal", + str(ctypes_source), + "-o", + str(obj_file), + ] + link_cmd = [ + "/opt/rocm/bin/hipcc", + "-shared", + "-fPIC", + f"--offload-arch={c.gfx_arch}", + "--hip-link", + str(obj_file), + str(static_lib), + "-o", + str(lib_path), + ] + + compile_index_map[len(compile_jobs)] = i + compile_jobs.append( + { + "compile_cmd": compile_cmd, + "link_cmd": link_cmd, + "lib_path": str(lib_path), + } + ) + + if verbose and compile_jobs: + print( + 
f"Compiling {len(compile_jobs)} libraries in parallel (workers={max_workers})..." + ) + + lib_paths: Dict[int, Optional[Path]] = {} + with ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit(_run_hipcc_subprocess, job): j + for j, job in enumerate(compile_jobs) + } + for future in as_completed(futures): + j = futures[future] + i = compile_index_map[j] + ok, lp, err = future.result() + if ok and lp: + lib_paths[i] = Path(lp) + if verbose: + print(f" OK [{i}] {valid_configs[i].tile_str}: {Path(lp).name}") + else: + results[i].error = f"Compile: {err}" + if verbose: + print(f" FAIL [{i}] {valid_configs[i].tile_str}: {err}") + + # -- Step 4: Load libraries and create dispatchers -------------------- + for i, c in enumerate(valid_configs): + lp = lib_paths.get(i) + if lp is None: + continue + + lib = DispatcherLib.load(lp) + if lib is not None and lib.initialize(): + results[i].lib = lib + reg = Registry(name=f"{registry_name}_{i}", lib=lib) + reg.register_kernel(c) + results[i].registry = reg + results[i].dispatcher = Dispatcher(registry=reg, lib=lib) + results[i].success = True + else: + results[i].error = "Failed to load compiled library" + + if verbose: + ok_count = sum(1 for r in results if r.success) + print(f"Setup complete: {ok_count}/{len(results)} dispatchers ready") + + return results + + def cleanup_gemm(): """ Cleanup function to call after running GEMM examples. diff --git a/dispatcher/python/dispatcher_common.py b/dispatcher/python/dispatcher_common.py new file mode 100644 index 0000000000..a19ecbdb49 --- /dev/null +++ b/dispatcher/python/dispatcher_common.py @@ -0,0 +1,372 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Shared Python dispatcher utilities for GEMM and grouped convolution. + +Extracted from ctypes_utils.py (GEMM) + compile_grouped_conv_examples.py (grouped conv). +Both ctypes_utils.py and grouped_conv_utils.py import from here to +eliminate duplication. 
+ +Best-of-both: + - Validation and auto-correction return typed objects (GEMM pattern) + - Colors class with cross-platform ANSI handling (conv pattern) + - Phased output helpers (conv pattern) + - logging module instead of bare print() (shared improvement) +""" + +import logging +import shutil +import sys +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +log = logging.getLogger(__name__) + + +# ============================================================================ +# Path Configuration +# ============================================================================ + + +def get_dispatcher_root() -> Path: + """Get the dispatcher root directory (parent of python/).""" + return Path(__file__).parent.parent + + +def get_ck_root() -> Path: + """Get the CK root directory (parent of dispatcher/).""" + return get_dispatcher_root().parent + + +def get_build_dir() -> Path: + """Get the build directory.""" + return get_dispatcher_root() / "build" + + +def get_generated_kernels_dir() -> Path: + """Get the generated kernels directory.""" + return get_build_dir() / "generated_kernels" + + +def get_codegen_dir() -> Path: + """Get the codegen scripts directory.""" + return get_dispatcher_root() / "codegen" + + +# ============================================================================ +# Architecture Filter Data +# ============================================================================ + +_arch_data_cache: Optional[Dict[str, Any]] = None + + +def detect_gpu_arch(fallback: str = "gfx942") -> str: + """Detect the GPU architecture from rocminfo. Falls back to the given default.""" + import subprocess + + try: + out = subprocess.check_output( + ["rocminfo"], text=True, stderr=subprocess.DEVNULL + ) + for line in out.splitlines(): + if "Name:" in line and "gfx" in line: + return line.split()[-1].strip() + except Exception: + pass + return fallback + + +def get_arch_filter_data() -> Dict[str, Any]: + """Load arch filter data from arch_specs_generated if available. + + Returns dict with keys: trait_unsupported, warp_combos, + warp_tile_combos, supported_archs. 
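# Illustrative sketch, not part of the patch: how a consumer module would compose
# the arch helpers above into a quick capability check before attempting codegen.
from dispatcher_common import detect_gpu_arch, get_arch_filter_data

arch = detect_gpu_arch(fallback="gfx942")
data = get_arch_filter_data()
if arch not in data["supported_archs"]:
    raise RuntimeError(f"{arch} has no generated arch specs")
print("valid wave configs:", data["warp_combos"].get(arch, [[2, 2, 1]]))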
+ """ + global _arch_data_cache + if _arch_data_cache is not None: + return _arch_data_cache + + codegen_dir = get_dispatcher_root() / "codegen" + sys.path.insert(0, str(codegen_dir)) + + try: + from arch_specs_generated import ( + TRAIT_UNSUPPORTED_COMBINATIONS, + WARP_SUPPORTED_COMBINATIONS, + WARP_TILE_SUPPORTED_COMBINATIONS, + get_supported_archs, + ) + + _arch_data_cache = { + "trait_unsupported": TRAIT_UNSUPPORTED_COMBINATIONS, + "warp_combos": WARP_SUPPORTED_COMBINATIONS, + "warp_tile_combos": WARP_TILE_SUPPORTED_COMBINATIONS, + "supported_archs": get_supported_archs(), + } + except ImportError: + _arch_data_cache = { + "trait_unsupported": { + ("compv3", "cshuffle", "interwave"), + ("compv3", "default", "interwave"), + ("compv4", "cshuffle", "interwave"), + ("compv4", "default", "interwave"), + }, + "warp_combos": { + "gfx942": [[1, 4, 1], [2, 2, 1], [4, 1, 1]], + "gfx90a": [[1, 4, 1], [2, 2, 1], [4, 1, 1]], + }, + "warp_tile_combos": { + "gfx942": {"fp16_fp16_fp32": [[16, 16, 16], [32, 32, 16]]}, + "gfx90a": {"fp16_fp16_fp32": [[16, 16, 16], [32, 32, 16]]}, + }, + "supported_archs": ["gfx90a", "gfx942", "gfx950"], + } + + return _arch_data_cache + + +# ============================================================================ +# Validation Result +# ============================================================================ + + +@dataclass +class ValidationResultBase: + """Result of kernel config validation (shared base for GEMM and conv).""" + + is_valid: bool + errors: List[str] = field(default_factory=list) + warnings: List[str] = field(default_factory=list) + suggested_fixes: Dict[str, Any] = field(default_factory=dict) + + def print_result(self, indent: str = " "): + if self.is_valid: + print(f"{indent}OK Configuration valid") + else: + print(f"{indent}WARNING Configuration has issues:") + for err in self.errors: + print(f"{indent} - {err}") + if self.warnings: + for warn in self.warnings: + print(f"{indent} Warning: {warn}") + if self.suggested_fixes: + print(f"{indent} Suggested fixes:") + for key, val in self.suggested_fixes.items(): + print(f"{indent} {key}: {val}") + + +# ============================================================================ +# Validation Helpers +# ============================================================================ + + +def validate_wave_config(wave_cfg: List[int], arch: str) -> Tuple[bool, str]: + """Validate a [wave_m, wave_n, wave_k] config for *arch*. + + Returns (is_valid, error_message). Empty string on success. + """ + data = get_arch_filter_data() + valid_waves = data["warp_combos"].get(arch, [[2, 2, 1]]) + if wave_cfg in valid_waves: + return True, "" + valid_str = ", ".join(f"[{c[0]},{c[1]},{c[2]}]" for c in valid_waves) + return ( + False, + f"Unsupported wave configuration {wave_cfg} for {arch}. " + f"Valid wave configs: {valid_str}", + ) + + +def validate_warp_tile_config( + warp_cfg: List[int], arch: str, dtype: str +) -> Tuple[bool, str]: + """Validate a [warp_m, warp_n, warp_k] config for *arch*/*dtype*. + + Returns (is_valid, error_message). Empty string on success. + """ + data = get_arch_filter_data() + acc = "int32" if dtype == "int8" else "fp32" + dtype_key = f"{dtype}_{dtype}_{acc}" + valid_tiles = ( + data["warp_tile_combos"] + .get(arch, {}) + .get(dtype_key, [[32, 32, 16], [16, 16, 16]]) + ) + if warp_cfg in valid_tiles: + return True, "" + valid_str = ", ".join(f"[{c[0]},{c[1]},{c[2]}]" for c in valid_tiles[:5]) + return ( + False, + f"Unsupported warp tile {warp_cfg} for {arch}/{dtype}. 
" + f"Valid warp tiles: {valid_str}", + ) + + +def validate_trait_combo( + pipeline: str, epilogue: str, scheduler: str +) -> Tuple[bool, str]: + """Validate a (pipeline, epilogue, scheduler) combination. + + Returns (is_valid, error_message). Empty string on success. + """ + data = get_arch_filter_data() + combo = (pipeline, epilogue, scheduler) + if combo in data["trait_unsupported"]: + return ( + False, + f"Unsupported trait combination: pipeline={pipeline}, " + f"epilogue={epilogue}, scheduler={scheduler}", + ) + return True, "" + + +# ============================================================================ +# Auto-Correction Helpers +# ============================================================================ + + +def auto_correct_wave(wave_cfg: List[int], arch: str) -> List[int]: + """Return the first valid wave config for *arch*. + + If *wave_cfg* is already valid, returns it unchanged. + """ + data = get_arch_filter_data() + valid_waves = data["warp_combos"].get(arch, [[2, 2, 1]]) + if wave_cfg in valid_waves: + return wave_cfg + return valid_waves[0] if valid_waves else [2, 2, 1] + + +def auto_correct_trait(pipeline: str, scheduler: str) -> Tuple[str, str]: + """Return a corrected (pipeline, scheduler) pair. + + If the compute pipeline doesn't support interwave, switch to intrawave. + """ + data = get_arch_filter_data() + for epilogue in ("cshuffle", "default"): + if (pipeline, epilogue, scheduler) in data["trait_unsupported"]: + return pipeline, "intrawave" + return pipeline, scheduler + + +# ============================================================================ +# Colors (adopted from compile_grouped_conv_examples.py -- cross-platform) +# ============================================================================ + + +class Colors: + """Cross-platform ANSI color support. + + Respects sys.platform (no ANSI on Windows) and isatty() check so + piped/redirected output stays clean. + """ + + _GREEN = "\033[0;32m" + _YELLOW = "\033[1;33m" + _RED = "\033[0;31m" + _CYAN = "\033[0;36m" + _BOLD = "\033[1m" + _NC = "\033[0m" + + @classmethod + def _use_color(cls) -> bool: + return ( + sys.platform != "win32" + and hasattr(sys.stdout, "isatty") + and sys.stdout.isatty() + ) + + @classmethod + def green(cls, text: str) -> str: + if cls._use_color(): + return f"{cls._GREEN}{text}{cls._NC}" + return text + + @classmethod + def red(cls, text: str) -> str: + if cls._use_color(): + return f"{cls._RED}{text}{cls._NC}" + return text + + @classmethod + def yellow(cls, text: str) -> str: + if cls._use_color(): + return f"{cls._YELLOW}{text}{cls._NC}" + return text + + @classmethod + def cyan(cls, text: str) -> str: + if cls._use_color(): + return f"{cls._CYAN}{text}{cls._NC}" + return text + + @classmethod + def bold(cls, text: str) -> str: + if cls._use_color(): + return f"{cls._BOLD}{text}{cls._NC}" + return text + + +# ============================================================================ +# Phased Output Helpers +# ============================================================================ + + +def print_phase(number: int, description: str) -> None: + """Print a phase header (e.g. 
'Phase 1: Codegen').""" + print(f"\n{'=' * 60}") + print(f" Phase {number}: {description}") + print(f"{'=' * 60}") + + +def print_success(message: str) -> None: + """Print a success message.""" + print(f" OK {Colors.green(message)}") + + +def print_error(message: str) -> None: + """Print an error message.""" + print(f" FAIL {Colors.red(message)}") + + +def print_info(message: str) -> None: + """Print an info message.""" + print(f" {Colors.cyan(message)}") + + +# ============================================================================ +# Cleanup Helpers +# ============================================================================ + + +def cleanup_generated_kernels(gen_dir: Optional[Path] = None) -> None: + """Remove generated kernel directory if it exists.""" + if gen_dir is None: + gen_dir = get_generated_kernels_dir() + if gen_dir.exists(): + shutil.rmtree(gen_dir, ignore_errors=True) + log.info("Cleaned up generated kernels at %s", gen_dir) + + +# ============================================================================ +# Tool Helpers +# ============================================================================ + + +def find_hipcc() -> Optional[str]: + """Find the hipcc compiler.""" + import os + + candidates = [ + os.environ.get("HIPCC"), + "/opt/rocm/bin/hipcc", + shutil.which("hipcc"), + ] + for path in candidates: + if path and os.path.isfile(path): + return path + return None diff --git a/dispatcher/python/grouped_conv_utils.py b/dispatcher/python/grouped_conv_utils.py new file mode 100644 index 0000000000..cd6ef5647c --- /dev/null +++ b/dispatcher/python/grouped_conv_utils.py @@ -0,0 +1,1806 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Grouped Convolution Dispatcher Utilities + +Typed Python API for grouped convolution kernels, matching the patterns from +the old conv_utils.py and the GEMM ctypes_utils.py. + +Classes: + GroupedConvKernelConfig - Kernel configuration (tile, wave, pipeline, arch) + GroupedConvProblem - Runtime problem specification (N,C,K,H,W,etc.) 
+ GroupedConvProblemC - ctypes struct matching C++ ConvProblemC + GroupedConvDispatcherLib - Wrapper for libdispatcher_conv_lib.so + GpuGroupedConvRunner - High-level GPU execution runner + GroupedConvResult - Result of GPU execution (output, time, tflops) + GroupedConvRegistry - Collection of kernel configs with JSON export + +Usage: + from grouped_conv_utils import ( + GroupedConvKernelConfig, + GroupedConvProblem, + GpuGroupedConvRunner, + ) + + config = GroupedConvKernelConfig(variant="forward", ndim_spatial=2) + problem = GroupedConvProblem(N=1, C=64, K=128, Hi=28, Wi=28, Y=3, X=3, + stride_h=1, pad_h=1, direction="forward") + runner = GpuGroupedConvRunner() + if runner.is_available(): + result = runner.run(input_np, weight_np, problem) + print(f"Time: {result.time_ms:.4f} ms, TFLOPS: {result.tflops:.2f}") +""" + +import ctypes +import json +import copy +import subprocess +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np + +from dispatcher_common import ( + ValidationResultBase, + auto_correct_trait, + auto_correct_wave, + get_arch_filter_data, + validate_trait_combo, + validate_wave_config, + validate_warp_tile_config, +) + + +# ============================================================================= +# Constants +# ============================================================================= + +VALID_VARIANTS = ("forward", "bwd_data", "bwd_weight") +VALID_NDIM_SPATIAL = (1, 2, 3) +BACKWARD_VARIANTS = ("bwd_data", "bwd_weight") +BACKWARD_PIPELINES = ("compv3", "mem") + +VARIANT_ALIASES = { + "2d_fwd": "forward", + "2d_bwdd": "bwd_data", + "2d_bwdw": "bwd_weight", + "fwd": "forward", + "bwdd": "bwd_data", + "bwdw": "bwd_weight", +} + +DIRECTION_MAP = {"forward": 0, "bwd_data": 1, "bwd_weight": 2} + + +def _resolve_variant(v: str) -> str: + return VARIANT_ALIASES.get(v, v) + + +# ============================================================================= +# GroupedConvDataType +# ============================================================================= + + +class GroupedConvDataType(Enum): + FP16 = "fp16" + BF16 = "bf16" + FP32 = "fp32" + FP8 = "fp8" + BF8 = "bf8" + INT8 = "int8" + + +# ============================================================================= +# GroupedConvKernelConfig +# ============================================================================= + + +@dataclass +class GroupedConvKernelConfig: + """Complete kernel configuration for grouped convolution. + + Captures all parameters needed to identify and run a specific kernel. + Mirrors the C++ GroupedConvSignature + GroupedConvAlgorithm. 
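# Illustrative sketch, not part of the patch: legacy variant spellings are
# normalised through VARIANT_ALIASES before any validation or codegen.
for legacy in ("2d_fwd", "bwdd", "2d_bwdw", "bwd_data"):
    print(f"{legacy} -> {_resolve_variant(legacy)}")
# 2d_fwd -> forward, bwdd -> bwd_data, 2d_bwdw -> bwd_weight, bwd_data -> bwd_data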
+ """ + + # What: signature + variant: str = "forward" + ndim_spatial: int = 2 + dtype: str = "fp16" + layout: str = "nhwgc" + arch: str = "gfx942" + + # How: algorithm - tile shape + tile_m: int = 1 + tile_n: int = 128 + tile_k: int = 128 + + # How: wave config + wave_m: int = 2 + wave_n: int = 2 + wave_k: int = 1 + + # How: warp tile + warp_tile_m: int = 32 + warp_tile_n: int = 32 + warp_tile_k: int = 16 + + # How: pipeline traits + pipeline: str = "compv4" + epilogue: str = "cshuffle" + scheduler: str = "intrawave" + + # ConvConfigBase parity fields + vector_size_a: int = 4 + vector_size_b: int = 8 + vector_size_c: int = 8 + block_per_cu: int = 1 + num_wave_groups: int = 1 + num_groups_to_merge: int = 1 + + # Padding (enables arbitrary problem sizes) + pad_m: bool = True + pad_n: bool = True + pad_k: bool = True + + def __post_init__(self): + self.variant = _resolve_variant(self.variant) + if ( + self.variant in BACKWARD_VARIANTS + and self.pipeline not in BACKWARD_PIPELINES + ): + self.pipeline = "compv3" + + @property + def tile_str(self) -> str: + return f"{self.tile_m}x{self.tile_n}x{self.tile_k}" + + @property + def wave_str(self) -> str: + return f"{self.wave_m}x{self.wave_n}x{self.wave_k}" + + @property + def warp_str(self) -> str: + return f"{self.warp_tile_m}x{self.warp_tile_n}x{self.warp_tile_k}" + + @property + def vec_str(self) -> str: + return f"{self.vector_size_a}x{self.vector_size_b}x{self.vector_size_c}" + + @property + def name(self) -> str: + return ( + f"grouped_conv_{self.variant}_{self.dtype}_{self.ndim_spatial}d_" + f"{self.tile_str}_{self.pipeline}" + ) + + def to_dict(self) -> dict: + """Convert to legacy dict format for codegen compatibility.""" + return { + "tile_config": { + "tile_m": [self.tile_m], + "tile_n": [self.tile_n], + "tile_k": [self.tile_k], + "wave_m": [self.wave_m], + "wave_n": [self.wave_n], + "wave_k": [self.wave_k], + "warp_tile_m": [self.warp_tile_m], + "warp_tile_n": [self.warp_tile_n], + "warp_tile_k": [self.warp_tile_k], + }, + "trait_config": { + "pipeline": [self.pipeline], + "epilogue": [self.epilogue], + "scheduler": [self.scheduler], + "pad_m": [self.pad_m], + "pad_n": [self.pad_n], + "pad_k": [self.pad_k], + "vector_size_a": [self.vector_size_a], + "vector_size_b": [self.vector_size_b], + "vector_size_c": [self.vector_size_c], + "block_per_cu": [self.block_per_cu], + "num_wave_groups": [self.num_wave_groups], + "num_groups_to_merge": [self.num_groups_to_merge], + }, + "variant": self.variant, + "ndim_spatial": self.ndim_spatial, + "arch": self.arch, + "layout": self.layout, + "dtype": self.dtype, + } + + def to_json_obj(self) -> dict: + """Serializable dict for JSON export.""" + return { + "name": self.name, + "signature": { + "variant": self.variant, + "dtype": self.dtype, + "ndim_spatial": self.ndim_spatial, + "layout": self.layout, + }, + "algorithm": { + "tile_m": self.tile_m, + "tile_n": self.tile_n, + "tile_k": self.tile_k, + "wave": self.wave_str, + "warp": self.warp_str, + "pipeline": self.pipeline, + "epilogue": self.epilogue, + "scheduler": self.scheduler, + "vector_sizes": [ + self.vector_size_a, + self.vector_size_b, + self.vector_size_c, + ], + "block_per_cu": self.block_per_cu, + "num_wave_groups": self.num_wave_groups, + "num_groups_to_merge": self.num_groups_to_merge, + }, + "arch": self.arch, + } + + def print_config(self, indent: str = " "): + print(f"{indent}GroupedConvKernelConfig:") + print(f"{indent} Variant: {self.variant} {self.ndim_spatial}D") + print(f"{indent} Dtype: {self.dtype}") + print(f"{indent} 
Layout: {self.layout}") + print(f"{indent} Arch: {self.arch}") + print(f"{indent} Tile: {self.tile_str}") + print(f"{indent} Wave: {self.wave_str}") + print(f"{indent} Warp: {self.warp_str}") + print(f"{indent} Pipeline: {self.pipeline}/{self.scheduler}/{self.epilogue}") + print(f"{indent} VecSizes: {self.vec_str}") + print( + f"{indent} BlockCU: {self.block_per_cu} WaveGroups: {self.num_wave_groups} MergeGroups: {self.num_groups_to_merge}" + ) + + +# ============================================================================= +# GroupedConvProblem +# ============================================================================= + + +@dataclass +class GroupedConvProblem: + """Runtime convolution problem specification. + + Describes the actual sizes of a convolution to be computed. + Matches the old ConvProblem from conv_utils.py. + """ + + N: int = 1 + C: int = 64 + K: int = 128 + G: int = 1 + + Hi: int = 28 + Wi: int = 28 + Di: int = 1 + + Y: int = 3 + X: int = 3 + Z: int = 1 + + stride_h: int = 1 + stride_w: int = 1 + stride_d: int = 1 + + pad_h: int = 0 + pad_w: int = 0 + pad_d: int = 0 + + dilation_h: int = 1 + dilation_w: int = 1 + dilation_d: int = 1 + + direction: str = "forward" + split_k: int = 1 + + @property + def Ho(self) -> int: + eff_y = (self.Y - 1) * self.dilation_h + 1 + return (self.Hi + 2 * self.pad_h - eff_y) // self.stride_h + 1 + + @property + def Wo(self) -> int: + eff_x = (self.X - 1) * self.dilation_w + 1 + return (self.Wi + 2 * self.pad_w - eff_x) // self.stride_w + 1 + + @property + def Do(self) -> int: + eff_z = (self.Z - 1) * self.dilation_d + 1 + return (self.Di + 2 * self.pad_d - eff_z) // self.stride_d + 1 + + @property + def is_3d(self) -> bool: + return self.Di > 1 or self.Z > 1 or self.pad_d > 0 + + @property + def ndim_spatial(self) -> int: + return 3 if self.is_3d else 2 + + @property + def flops(self) -> float: + """Total FLOPs for this convolution (any direction, same count).""" + c_per_group = self.C // self.G + if self.is_3d: + return ( + 2.0 + * self.N + * self.K + * self.Do + * self.Ho + * self.Wo + * c_per_group + * self.Z + * self.Y + * self.X + ) + return 2.0 * self.N * self.K * self.Ho * self.Wo * c_per_group * self.Y * self.X + + @property + def gflops(self) -> float: + return self.flops / 1e9 + + def input_shape(self) -> tuple: + """NHWGC or NDHWGC layout.""" + c_per_g = self.C // self.G + if self.is_3d: + return (self.N, self.Di, self.Hi, self.Wi, self.G, c_per_g) + return (self.N, self.Hi, self.Wi, self.G, c_per_g) + + def weight_shape(self) -> tuple: + """GKYXC or GKZYXC layout.""" + c_per_g = self.C // self.G + k_per_g = self.K // self.G + if self.is_3d: + return (self.G, k_per_g, self.Z, self.Y, self.X, c_per_g) + return (self.G, k_per_g, self.Y, self.X, c_per_g) + + def output_shape(self) -> tuple: + """NHWGK or NDHWGK layout.""" + k_per_g = self.K // self.G + if self.is_3d: + return (self.N, self.Do, self.Ho, self.Wo, self.G, k_per_g) + return (self.N, self.Ho, self.Wo, self.G, k_per_g) + + def print_problem(self, indent: str = " "): + dim_str = "3D" if self.is_3d else "2D" + print(f"{indent}GroupedConvProblem ({dim_str} {self.direction}):") + print(f"{indent} Batch: N={self.N}, G={self.G}") + print(f"{indent} Channels: C={self.C}, K={self.K}") + if self.is_3d: + print(f"{indent} Input: Di={self.Di}, Hi={self.Hi}, Wi={self.Wi}") + print(f"{indent} Filter: Z={self.Z}, Y={self.Y}, X={self.X}") + print(f"{indent} Output: Do={self.Do}, Ho={self.Ho}, Wo={self.Wo}") + else: + print(f"{indent} Input: Hi={self.Hi}, Wi={self.Wi}") + 
print(f"{indent} Filter: Y={self.Y}, X={self.X}") + print(f"{indent} Output: Ho={self.Ho}, Wo={self.Wo}") + print(f"{indent} GFLOPs: {self.gflops:.2f}") + + +# ============================================================================= +# GroupedConvProblemC (ctypes struct matching C++) +# ============================================================================= + + +class GroupedConvProblemC(ctypes.Structure): + """C structure matching ConvProblemC in conv_ctypes_lib.cpp.""" + + _fields_ = [ + ("N", ctypes.c_int), + ("G", ctypes.c_int), + ("C", ctypes.c_int), + ("K", ctypes.c_int), + ("input_d", ctypes.c_int), + ("input_h", ctypes.c_int), + ("input_w", ctypes.c_int), + ("filter_z", ctypes.c_int), + ("filter_y", ctypes.c_int), + ("filter_x", ctypes.c_int), + ("stride_d", ctypes.c_int), + ("stride_h", ctypes.c_int), + ("stride_w", ctypes.c_int), + ("pad_d", ctypes.c_int), + ("pad_h", ctypes.c_int), + ("pad_w", ctypes.c_int), + ("dilation_d", ctypes.c_int), + ("dilation_h", ctypes.c_int), + ("dilation_w", ctypes.c_int), + ("direction", ctypes.c_int), + ("split_k", ctypes.c_int), + ] + + @classmethod + def from_problem(cls, p: GroupedConvProblem) -> "GroupedConvProblemC": + c = cls() + c.N, c.G, c.C, c.K = p.N, p.G, p.C, p.K + c.input_d, c.input_h, c.input_w = p.Di, p.Hi, p.Wi + c.filter_z, c.filter_y, c.filter_x = p.Z, p.Y, p.X + c.stride_d, c.stride_h, c.stride_w = p.stride_d, p.stride_h, p.stride_w + c.pad_d, c.pad_h, c.pad_w = p.pad_d, p.pad_h, p.pad_w + c.dilation_d, c.dilation_h, c.dilation_w = ( + p.dilation_d, + p.dilation_h, + p.dilation_w, + ) + c.direction = DIRECTION_MAP.get(p.direction, 0) + c.split_k = getattr(p, "split_k", 1) + return c + + +# ============================================================================= +# GroupedConvResult +# ============================================================================= + + +@dataclass +class GroupedConvResult: + """Result of GPU convolution execution.""" + + success: bool = False + time_ms: float = 0.0 + tflops: float = 0.0 + output: Optional[np.ndarray] = None + error: str = "" + + +# ============================================================================= +# GroupedConvDispatcherLib +# ============================================================================= + + +class GroupedConvDispatcherLib: + """Wrapper for the compiled convolution dispatcher library. + + Provides Python interface to the C API in conv_ctypes_lib.cpp. 
+ """ + + SEARCH_PATHS = [ + "build/examples/libdispatcher_conv_lib.so", + "build/bindings/libdispatcher_conv_lib.so", + "build/lib/libdispatcher_conv_lib.so", + ] + + def __init__(self, lib: ctypes.CDLL, path: Path): + self._lib = lib + self._path = path + self._setup_functions() + + def _setup_functions(self): + self._lib.conv_dispatcher_init.argtypes = [] + self._lib.conv_dispatcher_init.restype = ctypes.c_int + self._lib.conv_dispatcher_cleanup.argtypes = [] + self._lib.conv_dispatcher_cleanup.restype = ctypes.c_int + self._lib.conv_dispatcher_version.argtypes = [] + self._lib.conv_dispatcher_version.restype = ctypes.c_char_p + self._lib.conv_dispatcher_has_kernels.argtypes = [] + self._lib.conv_dispatcher_has_kernels.restype = ctypes.c_int + self._lib.conv_dispatcher_has_bwd_data.argtypes = [] + self._lib.conv_dispatcher_has_bwd_data.restype = ctypes.c_int + self._lib.conv_dispatcher_has_bwd_weight.argtypes = [] + self._lib.conv_dispatcher_has_bwd_weight.restype = ctypes.c_int + self._lib.conv_dispatcher_get_kernel_count.argtypes = [] + self._lib.conv_dispatcher_get_kernel_count.restype = ctypes.c_int + self._lib.conv_dispatcher_get_kernel_name.argtypes = [ + ctypes.c_int, + ctypes.c_char_p, + ctypes.c_int, + ] + self._lib.conv_dispatcher_get_kernel_name.restype = ctypes.c_int + self._lib.conv_dispatcher_is_supported.argtypes = [ + ctypes.POINTER(GroupedConvProblemC), + ] + self._lib.conv_dispatcher_is_supported.restype = ctypes.c_int + self._lib.conv_dispatcher_run.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.POINTER(GroupedConvProblemC), + ctypes.c_void_p, + ] + self._lib.conv_dispatcher_run.restype = ctypes.c_float + + @classmethod + def find(cls) -> Optional["GroupedConvDispatcherLib"]: + """Search standard paths for the conv library.""" + root = Path(__file__).parent.parent + for rel in cls.SEARCH_PATHS: + path = root / rel + if path.exists(): + try: + lib = ctypes.CDLL(str(path)) + return cls(lib, path) + except OSError: + continue + return None + + @property + def path(self) -> Path: + return self._path + + def initialize(self): + self._lib.conv_dispatcher_init() + + def cleanup(self): + self._lib.conv_dispatcher_cleanup() + + def version(self) -> str: + return self._lib.conv_dispatcher_version().decode() + + def has_forward(self) -> bool: + return self._lib.conv_dispatcher_has_kernels() != 0 + + def has_bwd_data(self) -> bool: + return self._lib.conv_dispatcher_has_bwd_data() != 0 + + def has_bwd_weight(self) -> bool: + return self._lib.conv_dispatcher_has_bwd_weight() != 0 + + def kernel_count(self) -> int: + return self._lib.conv_dispatcher_get_kernel_count() + + def kernel_names(self) -> List[str]: + names = [] + for i in range(self.kernel_count()): + buf = ctypes.create_string_buffer(256) + if self._lib.conv_dispatcher_get_kernel_name(i, buf, 256) == 0: + names.append(buf.value.decode()) + return names + + def is_supported(self, problem: GroupedConvProblem) -> bool: + pc = GroupedConvProblemC.from_problem(problem) + return self._lib.conv_dispatcher_is_supported(ctypes.byref(pc)) != 0 + + def run( + self, a_ptr: int, b_ptr: int, c_ptr: int, problem: GroupedConvProblem + ) -> float: + """Run convolution. 
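# Illustrative sketch, not part of the patch: probing the compiled conv library
# before launching anything through the high-level runner.
lib = GroupedConvDispatcherLib.find()
if lib is not None:
    lib.initialize()
    print(lib.version(), "kernels:", lib.kernel_count())
    print("forward support built in:", lib.has_forward())
    print("problem supported:",
          lib.is_supported(GroupedConvProblem(N=1, C=64, K=128, Hi=28, Wi=28)))
    lib.cleanup()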
Returns time_ms (>0 success, <0 error).""" + pc = GroupedConvProblemC.from_problem(problem) + return self._lib.conv_dispatcher_run( + a_ptr, b_ptr, c_ptr, ctypes.byref(pc), None + ) + + +# ============================================================================= +# GpuGroupedConvRunner +# ============================================================================= + + +class GpuGroupedConvRunner: + """High-level GPU convolution runner. + + Handles library loading, HIP memory management, and kernel execution. + Follows the same pattern as the old GpuConvRunner from conv_utils.py. + + Usage: + runner = GpuGroupedConvRunner() + if runner.is_available(): + result = runner.run(input_np, weight_np, problem) + print(f"Time: {result.time_ms:.4f} ms, TFLOPS: {result.tflops:.2f}") + """ + + HIP_MEMCPY_H2D = 1 + HIP_MEMCPY_D2H = 2 + + def __init__(self, lib_path: Optional[str] = None): + self._dispatch_lib: Optional[GroupedConvDispatcherLib] = None + self._hip = None + self._initialized = False + + try: + if lib_path: + lib = ctypes.CDLL(lib_path) + self._dispatch_lib = GroupedConvDispatcherLib(lib, Path(lib_path)) + else: + self._dispatch_lib = GroupedConvDispatcherLib.find() + + if self._dispatch_lib is None: + return + + self._hip = ctypes.CDLL("libamdhip64.so") + self._hip.hipMalloc.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + ctypes.c_size_t, + ] + self._hip.hipMalloc.restype = ctypes.c_int + self._hip.hipFree.argtypes = [ctypes.c_void_p] + self._hip.hipFree.restype = ctypes.c_int + self._hip.hipMemcpy.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_size_t, + ctypes.c_int, + ] + self._hip.hipMemcpy.restype = ctypes.c_int + self._hip.hipDeviceSynchronize.argtypes = [] + self._hip.hipDeviceSynchronize.restype = ctypes.c_int + + self._dispatch_lib.initialize() + self._initialized = True + except Exception: + self._initialized = False + + def is_available(self) -> bool: + return self._initialized and self._dispatch_lib is not None + + @property + def library_path(self) -> Optional[str]: + if self._dispatch_lib: + return str(self._dispatch_lib.path) + return None + + @property + def lib(self) -> Optional[GroupedConvDispatcherLib]: + return self._dispatch_lib + + def run( + self, + input_np: np.ndarray, + weight_np: np.ndarray, + problem: GroupedConvProblem, + output_np: Optional[np.ndarray] = None, + ) -> GroupedConvResult: + """Run convolution on GPU. + + Args: + input_np: For forward: X (NHWGC). For bwd_data: dY. For bwd_weight: X. + weight_np: For forward: W (GKYXC). For bwd_data: W. For bwd_weight: dY. + problem: Problem specification. + output_np: Optional pre-allocated output buffer. + + Returns: + GroupedConvResult with success, time_ms, tflops, output. 
+ """ + if not self.is_available(): + return GroupedConvResult(error="GPU not available") + + try: + # Determine output shape based on direction + d = problem.direction + if d == "bwd_data": + out_shape = problem.input_shape() + elif d == "bwd_weight": + out_shape = problem.weight_shape() + else: + out_shape = problem.output_shape() + + if output_np is None: + output_np = np.zeros(out_shape, dtype=input_np.dtype) + + output_size = output_np.nbytes + + # Allocate GPU memory + d_a, d_b, d_c = ctypes.c_void_p(), ctypes.c_void_p(), ctypes.c_void_p() + self._hip.hipMalloc(ctypes.byref(d_a), input_np.nbytes) + self._hip.hipMalloc(ctypes.byref(d_b), weight_np.nbytes) + self._hip.hipMalloc(ctypes.byref(d_c), output_size) + + # Host to device + self._hip.hipMemcpy( + d_a, input_np.ctypes.data, input_np.nbytes, self.HIP_MEMCPY_H2D + ) + self._hip.hipMemcpy( + d_b, weight_np.ctypes.data, weight_np.nbytes, self.HIP_MEMCPY_H2D + ) + self._hip.hipDeviceSynchronize() + + # Launch kernel + time_ms = self._dispatch_lib.run(d_a.value, d_b.value, d_c.value, problem) + self._hip.hipDeviceSynchronize() + + result = GroupedConvResult() + + if time_ms > 0: + # Device to host + self._hip.hipMemcpy( + output_np.ctypes.data, d_c, output_size, self.HIP_MEMCPY_D2H + ) + self._hip.hipDeviceSynchronize() + result.success = True + result.time_ms = time_ms + result.tflops = problem.flops / (time_ms * 1e9) + result.output = output_np + else: + result.error = ( + "unsupported" + if time_ms == -3.0 + else "no kernel" + if time_ms == -2.0 + else f"error (code {time_ms})" + ) + + # Free GPU memory + self._hip.hipFree(d_a) + self._hip.hipFree(d_b) + self._hip.hipFree(d_c) + + return result + + except Exception as e: + return GroupedConvResult(error=str(e)) + + def cleanup(self): + if self._dispatch_lib: + try: + self._dispatch_lib.cleanup() + except Exception: + pass + + +# ============================================================================= +# GroupedConvRegistry +# ============================================================================= + + +class GroupedConvRegistry: + """Collection of grouped conv kernel configs with JSON export/import.""" + + def __init__(self, name: str = "default"): + self.name = name + self._kernels: List[GroupedConvKernelConfig] = [] + + def add(self, config: GroupedConvKernelConfig): + self._kernels.append(config) + + @property + def kernels(self) -> List[GroupedConvKernelConfig]: + return list(self._kernels) + + def __len__(self) -> int: + return len(self._kernels) + + def select( + self, problem: "GroupedConvProblem", heuristic=None + ) -> Optional[GroupedConvKernelConfig]: + """Select the best kernel for a problem. + + Args: + problem: The convolution problem. + heuristic: Optional callable(problem) -> List[str] returning + ranked kernel name substrings. The registry tries + each in order; falls back to first matching kernel. + + Returns: + The best matching GroupedConvKernelConfig, or None. 
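# Illustrative sketch, not part of the patch: select() takes an optional ranking
# heuristic that returns kernel-name substrings in preference order. The tile
# policy below is an assumption for demonstration, not a shipped heuristic.
reg = GroupedConvRegistry("demo")
reg.add(GroupedConvKernelConfig(variant="forward", tile_n=128, tile_k=128))
reg.add(GroupedConvKernelConfig(variant="forward", tile_n=256, tile_k=64))

def prefer_wide_n(problem):
    return ["256x64"] if problem.Ho * problem.Wo >= 56 * 56 else ["128x128"]

cfg = reg.select(GroupedConvProblem(Hi=56, Wi=56, Y=3, X=3, pad_h=1, pad_w=1),
                 heuristic=prefer_wide_n)
print(cfg.name if cfg else "no kernel for this variant")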
+ """ + matching = [k for k in self._kernels if k.variant == problem.direction] + if not matching: + return None + + if heuristic is not None: + ranked = heuristic(problem) + for hint in ranked: + for k in matching: + if hint in k.name: + return k + + return matching[0] if matching else None + + def filter_by_variant(self, variant: str) -> "GroupedConvRegistry": + variant = _resolve_variant(variant) + reg = GroupedConvRegistry(f"{self.name}_{variant}") + for k in self._kernels: + if k.variant == variant: + reg.add(k) + return reg + + def filter_by_arch(self, arch: str) -> "GroupedConvRegistry": + reg = GroupedConvRegistry(f"{self.name}_{arch}") + for k in self._kernels: + if k.arch == arch: + reg.add(k) + return reg + + def to_json(self, indent: int = 2) -> str: + return json.dumps( + { + "name": self.name, + "kernels": [k.to_json_obj() for k in self._kernels], + }, + indent=indent, + ) + + @classmethod + def from_json(cls, json_str: str) -> "GroupedConvRegistry": + data = json.loads(json_str) + reg = cls(data.get("name", "imported")) + for kd in data.get("kernels", []): + sig = kd.get("signature", {}) + algo = kd.get("algorithm", {}) + wave = algo.get("wave", "2x2x1").split("x") + warp = algo.get("warp", "32x32x16").split("x") + vec = algo.get("vector_sizes", [4, 8, 8]) + reg.add( + GroupedConvKernelConfig( + variant=sig.get("variant", "forward"), + ndim_spatial=sig.get("ndim_spatial", 2), + dtype=sig.get("dtype", "fp16"), + layout=sig.get("layout", "nhwgc"), + arch=kd.get("arch", "gfx942"), + tile_m=algo.get("tile_m", 1), + tile_n=algo.get("tile_n", 128), + tile_k=algo.get("tile_k", 128), + wave_m=int(wave[0]), + wave_n=int(wave[1]), + wave_k=int(wave[2]), + warp_tile_m=int(warp[0]), + warp_tile_n=int(warp[1]), + warp_tile_k=int(warp[2]), + pipeline=algo.get("pipeline", "compv3"), + epilogue=algo.get("epilogue", "cshuffle"), + scheduler=algo.get("scheduler", "intrawave"), + vector_size_a=vec[0] if len(vec) > 0 else 4, + vector_size_b=vec[1] if len(vec) > 1 else 8, + vector_size_c=vec[2] if len(vec) > 2 else 8, + block_per_cu=algo.get("block_per_cu", 1), + num_wave_groups=algo.get("num_wave_groups", 1), + num_groups_to_merge=algo.get("num_groups_to_merge", 1), + ) + ) + return reg + + def build( + self, + verbose: bool = False, + max_workers: Optional[int] = None, + ) -> Dict[Tuple[str, int], "GpuGroupedConvRunner"]: + """Parallel JIT compile all kernels in this registry. + + Args: + verbose: Print progress during build. + max_workers: Max parallel codegen/compile processes (default: cpu_count capped at 8). + + Returns a dict mapping (variant, ndim_spatial) to a ready-to-use + GpuGroupedConvRunner. 
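# Illustrative sketch, not part of the patch: registries round-trip through JSON,
# so a tuned kernel set can be exported, versioned, and re-imported later.
reg = GroupedConvRegistry("tuned_fwd")
reg.add(GroupedConvKernelConfig(variant="forward", dtype="fp16", arch="gfx942"))
blob = reg.to_json()
restored = GroupedConvRegistry.from_json(blob)
assert len(restored) == len(reg)
assert restored.kernels[0].name == reg.kernels[0].name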
+ """ + if not self._kernels: + return {} + + libs = setup_multiple_grouped_conv_dispatchers( + self._kernels, + verbose=verbose, + max_workers=max_workers, + ) + + runners: Dict[Tuple[str, int], GpuGroupedConvRunner] = {} + for cfg, lib in zip(self._kernels, libs): + if lib is None: + continue + key = (cfg.variant, cfg.ndim_spatial) + if key in runners: + continue + runner = GpuGroupedConvRunner(lib_path=str(lib.path)) + if runner.is_available(): + runners[key] = runner + return runners + + def print_registry(self, indent: str = " "): + print(f"{indent}Registry '{self.name}': {len(self)} kernels") + for i, k in enumerate(self._kernels): + print( + f"{indent} [{i}] {k.name} (valid={validate_grouped_conv_config(k.to_dict()).is_valid})" + ) + + +# ============================================================================= +# GroupedConvValidationResult +# ============================================================================= + + +@dataclass +class GroupedConvValidationResult(ValidationResultBase): + """Result of grouped conv kernel config validation.""" + + variant: str = "forward" + + def __init__( + self, + is_valid=True, + errors=None, + warnings=None, + suggested_fixes=None, + variant="forward", + ): + super().__init__( + is_valid=is_valid, + errors=errors or [], + warnings=warnings or [], + suggested_fixes=suggested_fixes or {}, + ) + self.variant = variant + + +# ============================================================================= +# Validation helpers (extracted from the original config extraction code) +# ============================================================================= + + +def _first(val): + if isinstance(val, list) and len(val) > 0: + return val[0] + return val + + +def _get_tile_config(config: dict) -> dict: + return config.get("tile_config") or {} + + +def _get_trait_config(config: dict) -> dict: + return config.get("trait_config") or {} + + +def _extract_wave_config(tile_config: dict) -> List[int]: + wm = tile_config.get("wave_m") or tile_config.get("warp_m") + wn = tile_config.get("wave_n") or tile_config.get("warp_n") + wk = tile_config.get("wave_k") or tile_config.get("warp_k") + if wm is not None and wn is not None and wk is not None: + return [_first(wm), _first(wn), _first(wk)] + return [2, 2, 1] + + +def _extract_warp_tile_config(tile_config: dict) -> List[int]: + wtm = tile_config.get("warp_tile_m") or tile_config.get("warp_m") + wtn = tile_config.get("warp_tile_n") or tile_config.get("warp_n") + wtk = tile_config.get("warp_tile_k") or tile_config.get("warp_k") + if wtm is not None and wtn is not None and wtk is not None: + return [_first(wtm), _first(wtn), _first(wtk)] + return [32, 32, 16] + + +def _extract_trait_values(trait_config: dict) -> Tuple[str, str, str]: + p = _first(trait_config.get("pipeline", "compv4")) + e = _first(trait_config.get("epilogue", "cshuffle")) + s = _first(trait_config.get("scheduler", "intrawave")) + if isinstance(p, list): + p = p[0] if p else "compv4" + if isinstance(e, list): + e = e[0] if e else "cshuffle" + if isinstance(s, list): + s = s[0] if s else "intrawave" + return (str(p), str(e), str(s)) + + +# ============================================================================= +# validate_grouped_conv_config / auto_correct_grouped_conv_config +# ============================================================================= + + +def validate_grouped_conv_config(config: dict) -> GroupedConvValidationResult: + """Validate a grouped conv kernel config dict. 
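# Illustrative sketch, not part of the patch: dataclass configs validate through
# the same dict-based path used for raw codegen configs; __post_init__ already
# forces backward variants onto a supported pipeline before validation runs.
cfg = GroupedConvKernelConfig(variant="bwd_data", pipeline="compv4")
assert cfg.pipeline == "compv3"          # corrected by __post_init__
res = validate_grouped_conv_config(cfg.to_dict())
res.print_result()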
+ + Accepts either a raw dict (legacy) or GroupedConvKernelConfig.to_dict() output. + """ + errors: List[str] = [] + warnings: List[str] = [] + suggested_fixes: Dict[str, Any] = {} + + required = ( + "tile_config", + "trait_config", + "variant", + "ndim_spatial", + "arch", + "layout", + ) + for key in required: + if key not in config: + errors.append(f"Missing required key: {key}") + if errors: + return GroupedConvValidationResult( + is_valid=False, + errors=errors, + warnings=warnings, + suggested_fixes=suggested_fixes, + variant=config.get("variant", "forward"), + ) + + tile_config = _get_tile_config(config) + trait_config = _get_trait_config(config) + variant = _first(config.get("variant", "forward")) + if isinstance(variant, list): + variant = variant[0] if variant else "forward" + variant = _resolve_variant(str(variant)) + + ndim_spatial = config.get("ndim_spatial") + arch = config.get("arch", "gfx942") + dtype = config.get("dtype", "fp16") + + if variant not in VALID_VARIANTS: + errors.append(f"Invalid variant: {variant}. Valid: {', '.join(VALID_VARIANTS)}") + suggested_fixes["variant"] = "forward" + + if ndim_spatial is not None: + ndim = ndim_spatial + if isinstance(ndim, list): + ndim = ndim[0] if ndim else 2 + if ndim not in VALID_NDIM_SPATIAL: + errors.append( + f"Invalid ndim_spatial: {ndim}. Valid: {', '.join(map(str, VALID_NDIM_SPATIAL))}" + ) + suggested_fixes["ndim_spatial"] = 2 + + pipeline, epilogue, scheduler = _extract_trait_values(trait_config) + if variant in BACKWARD_VARIANTS and pipeline not in BACKWARD_PIPELINES: + errors.append( + f"Backward variant '{variant}' requires pipeline compv3 or mem, got {pipeline}" + ) + suggested_fixes["pipeline"] = "compv3" + + ok, msg = validate_trait_combo(pipeline, epilogue, scheduler) + if not ok: + errors.append(msg) + suggested_fixes["scheduler"] = "intrawave" + + wave_cfg = _extract_wave_config(tile_config) + ok, msg = validate_wave_config(wave_cfg, arch) + if not ok: + errors.append(msg) + arch_data = get_arch_filter_data() + valid_waves = arch_data["warp_combos"].get(arch, [[2, 2, 1]]) + if valid_waves: + suggested_fixes["wave_m"] = valid_waves[0][0] + suggested_fixes["wave_n"] = valid_waves[0][1] + suggested_fixes["wave_k"] = valid_waves[0][2] + + warp_cfg = _extract_warp_tile_config(tile_config) + ok, msg = validate_warp_tile_config(warp_cfg, arch, dtype) + if not ok: + errors.append(msg) + arch_data = get_arch_filter_data() + acc = "int32" if dtype == "int8" else "fp32" + dtype_key = f"{dtype}_{dtype}_{acc}" + valid_tiles = ( + arch_data["warp_tile_combos"] + .get(arch, {}) + .get(dtype_key, [[32, 32, 16], [16, 16, 16]]) + ) + if valid_tiles: + suggested_fixes["warp_tile_m"] = valid_tiles[0][0] + suggested_fixes["warp_tile_n"] = valid_tiles[0][1] + suggested_fixes["warp_tile_k"] = valid_tiles[0][2] + + arch_data = get_arch_filter_data() + if arch not in arch_data["supported_archs"]: + errors.append( + f"Unsupported architecture: {arch}. Supported: {', '.join(arch_data['supported_archs'])}" + ) + + return GroupedConvValidationResult( + is_valid=len(errors) == 0, + errors=errors, + warnings=warnings, + suggested_fixes=suggested_fixes, + variant=variant, + ) + + +def auto_correct_grouped_conv_config( + config: dict, +) -> Tuple[dict, GroupedConvValidationResult]: + """Auto-correct invalid grouped conv config. 
Returns (corrected, result).""" + result = validate_grouped_conv_config(config) + corrected = copy.deepcopy(config) + + if result.is_valid: + return corrected, result + + tile_config = corrected.setdefault("tile_config", {}) + trait_config = corrected.setdefault("trait_config", {}) + + wave_cfg = _extract_wave_config(tile_config) + arch = config.get("arch", "gfx942") + fixed_wave = auto_correct_wave(wave_cfg, arch) + tile_config["wave_m"] = fixed_wave[0] + tile_config["wave_n"] = fixed_wave[1] + tile_config["wave_k"] = fixed_wave[2] + + pipeline, epilogue, scheduler = _extract_trait_values(trait_config) + fixed_pipeline, fixed_scheduler = auto_correct_trait(pipeline, scheduler) + trait_config["pipeline"] = fixed_pipeline + trait_config["scheduler"] = fixed_scheduler + + variant = _first(config.get("variant", "forward")) + if isinstance(variant, list): + variant = variant[0] if variant else "forward" + variant = _resolve_variant(str(variant)) + if variant in BACKWARD_VARIANTS and fixed_pipeline not in BACKWARD_PIPELINES: + trait_config["pipeline"] = "compv3" + + if "warp_tile_m" in result.suggested_fixes: + tile_config["warp_tile_m"] = result.suggested_fixes["warp_tile_m"] + tile_config["warp_tile_n"] = result.suggested_fixes["warp_tile_n"] + tile_config["warp_tile_k"] = result.suggested_fixes["warp_tile_k"] + + result = validate_grouped_conv_config(corrected) + return corrected, result + + +def _run_hipcc_subprocess(args: dict) -> Tuple[bool, Optional[Path], str]: + """Run one hipcc compile+link job in a subprocess worker.""" + import subprocess + from pathlib import Path + + compile_cmd = args["compile_cmd"] + link_cmd = args["link_cmd"] + lib_path = Path(args["lib_path"]) + + try: + res_c = subprocess.run(compile_cmd, capture_output=True, text=True, timeout=300) + if res_c.returncode != 0: + return False, None, f"Compile failed: {res_c.stderr[:400]}" + + res_l = subprocess.run(link_cmd, capture_output=True, text=True, timeout=300) + if res_l.returncode != 0: + return False, None, f"Link failed: {res_l.stderr[:400]}" + + return True, lib_path, "" + except subprocess.TimeoutExpired: + return False, None, "Timeout" + except Exception as e: + return False, None, f"Error: {e}" + + +def _run_conv_codegen_subprocess(args: dict) -> Tuple[bool, Optional[str], str]: + """Run grouped-conv codegen once and return generated kernel header path.""" + import subprocess + from pathlib import Path + + out_dir = Path(args["output_dir"]) + out_dir.mkdir(parents=True, exist_ok=True) + + # Remove stale kernels so header discovery is exact for this invocation. 
+ for stale in out_dir.glob("grouped_conv_*.hpp"): + stale.unlink(missing_ok=True) + for stale in out_dir.glob("include_all_grouped_conv_*.hpp"): + stale.unlink(missing_ok=True) + + try: + res = subprocess.run(args["cmd"], capture_output=True, text=True, timeout=300) + if res.returncode != 0: + err = (res.stderr or res.stdout or "").strip()[:500] + return False, None, f"Codegen failed: {err}" + + generated = sorted( + out_dir.glob("grouped_conv_*.hpp"), + key=lambda p: p.stat().st_mtime, + reverse=True, + ) + if not generated: + return False, None, "Codegen produced no grouped_conv_*.hpp header" + + return True, str(generated[0]), "" + except subprocess.TimeoutExpired: + return False, None, "Codegen timed out" + except Exception as e: + return False, None, f"Codegen error: {e}" + + +def _config_key(c: GroupedConvKernelConfig) -> Tuple[Any, ...]: + return ( + c.variant, + c.ndim_spatial, + c.dtype, + c.layout, + c.arch, + c.tile_m, + c.tile_n, + c.tile_k, + c.wave_m, + c.wave_n, + c.wave_k, + c.warp_tile_m, + c.warp_tile_n, + c.warp_tile_k, + c.pipeline, + c.epilogue, + c.scheduler, + ) + + +def _parse_triplet(value: str) -> Tuple[int, int, int]: + parts = value.split("x") + if len(parts) != 3: + raise ValueError(f"Invalid triplet: {value}") + return int(parts[0]), int(parts[1]), int(parts[2]) + + +def _list_arch_valid_grouped_conv_configs( + codegen_script: Path, + arch: str, + dtype: str, + variant: str, + ndim_spatial: int, +) -> List[GroupedConvKernelConfig]: + """Query codegen defaults for this (arch, dtype, variant, ndim) tuple.""" + import re + import sys + + cmd = [ + sys.executable, + str(codegen_script), + "--list-configs", + "--arch", + arch, + "--datatype", + dtype, + "--variant", + variant, + "--ndim", + str(ndim_spatial), + ] + res = subprocess.run(cmd, capture_output=True, text=True, timeout=180) + if res.returncode != 0: + return [] + + # Example: + # grouped_conv_fwd_fp16_nhwgc_2d_compv3_cshuffle_intrawave_128x128x32_2x2x1_32x32x16 + name_re = re.compile( + r"^grouped_conv_(fwd|bwd_data|bwd_weight|bwdd|bwdw)_([a-z0-9]+)_([a-z0-9]+)_([123])d_" + r"([a-z0-9]+)_([a-z0-9]+)_([a-z0-9]+)_" + r"([0-9]+x[0-9]+x[0-9]+)_([0-9]+x[0-9]+x[0-9]+)_([0-9]+x[0-9]+x[0-9]+)" + r"(?:_.*)?$" + ) + short_to_variant = { + "fwd": "forward", + "bwd_data": "bwd_data", + "bwd_weight": "bwd_weight", + "bwdd": "bwd_data", + "bwdw": "bwd_weight", + } + + out: List[GroupedConvKernelConfig] = [] + seen = set() + for raw in res.stdout.splitlines(): + line = raw.strip() + if not line.startswith("- grouped_conv_"): + continue + name = line[2:].strip() + m = name_re.match(name) + if not m: + continue + + v_short, dt, layout, ndim, pipe, epi, sched, tile_s, wave_s, warp_s = m.groups() + tm, tn, tk = _parse_triplet(tile_s) + wm, wn, wk = _parse_triplet(wave_s) + wtm, wtn, wtk = _parse_triplet(warp_s) + + cfg = GroupedConvKernelConfig( + variant=short_to_variant[v_short], + ndim_spatial=int(ndim), + dtype=dt, + layout=layout, + arch=arch, + tile_m=tm, + tile_n=tn, + tile_k=tk, + wave_m=wm, + wave_n=wn, + wave_k=wk, + warp_tile_m=wtm, + warp_tile_n=wtn, + warp_tile_k=wtk, + pipeline=pipe, + epilogue=epi, + scheduler=sched, + ) + key = _config_key(cfg) + if key not in seen: + out.append(cfg) + seen.add(key) + + return out + + +def _select_best_arch_valid_conv_config( + requested: GroupedConvKernelConfig, + candidates: List[GroupedConvKernelConfig], +) -> GroupedConvKernelConfig: + """Pick nearest arch-valid config while preferring trait exact matches.""" + + def score(c: GroupedConvKernelConfig) -> Tuple[int, int, 
int, int, int, int]: + tile_delta = ( + abs(c.tile_m - requested.tile_m) + + abs(c.tile_n - requested.tile_n) + + abs(c.tile_k - requested.tile_k) + ) + wave_delta = ( + abs(c.wave_m - requested.wave_m) + + abs(c.wave_n - requested.wave_n) + + abs(c.wave_k - requested.wave_k) + ) + warp_tile_delta = ( + abs(c.warp_tile_m - requested.warp_tile_m) + + abs(c.warp_tile_n - requested.warp_tile_n) + + abs(c.warp_tile_k - requested.warp_tile_k) + ) + return ( + 0 if c.pipeline == requested.pipeline else 1, + 0 if c.scheduler == requested.scheduler else 1, + 0 if c.epilogue == requested.epilogue else 1, + tile_delta, + wave_delta, + warp_tile_delta, + ) + + best = min(candidates, key=score) + selected = copy.deepcopy(best) + selected.arch = requested.arch + return selected + + +def _write_single_conv_dispatch_header( + config: GroupedConvKernelConfig, + kernel_header: Path, + dispatch_header: Path, +) -> None: + """Create a tiny dispatch header consumed by conv_ctypes_lib.cpp.""" + macros: List[str] = [] + aliases: List[str] = [] + + if config.variant == "forward": + kernel_name_symbol = "CONV_FWD_KERNEL_NAME" + if config.ndim_spatial == 3: + macros.append("#define CONV_FWD_3D_AVAILABLE 1") + aliases.append("using ConvFwd3dLauncher = SelectedConvKernelLauncher;") + else: + macros.append("#define CONV_FWD_2D_AVAILABLE 1") + elif config.variant == "bwd_data": + kernel_name_symbol = "CONV_BWD_DATA_KERNEL_NAME" + if config.ndim_spatial == 3: + macros.append("#define CONV_BWD_DATA_3D_AVAILABLE 1") + aliases.append("using ConvBwdData3dLauncher = SelectedConvBwdDataLauncher;") + else: + macros.append("#define CONV_BWD_DATA_2D_AVAILABLE 1") + else: + kernel_name_symbol = "CONV_BWD_WEIGHT_KERNEL_NAME" + if config.ndim_spatial == 3: + macros.append("#define CONV_BWD_WEIGHT_3D_AVAILABLE 1") + aliases.append( + "using ConvBwdWeight3dLauncher = SelectedConvBwdWeightLauncher;" + ) + else: + macros.append("#define CONV_BWD_WEIGHT_2D_AVAILABLE 1") + + content = ( + "// Auto-generated single-kernel dispatch header for Python JIT\n" + "#pragma once\n\n" + f'#include "{kernel_header.name}"\n\n' + + "\n".join(macros) + + "\n\n" + + "\n".join(aliases) + + "\n\n" + + f"static const char* CONV_KERNEL_NAMES[] = {{{kernel_name_symbol}}};\n" + + "static constexpr int CONV_KERNEL_COUNT = 1;\n" + ) + dispatch_header.write_text(content) + + +class GroupedConvCodegenRunner: + """Generate and compile grouped-conv JIT libraries in parallel.""" + + def __init__(self, max_workers: Optional[int] = None): + import multiprocessing + + self.max_workers = max_workers or min(multiprocessing.cpu_count(), 8) + self.root = Path(__file__).parent.parent + self.build_dir = self.root / "build" + self.codegen_script = self.root / "codegen" / "unified_grouped_conv_codegen.py" + + def generate_and_compile_parallel( + self, + configs: List[GroupedConvKernelConfig], + verbose: bool = True, + ) -> List[Optional[Path]]: + import sys + from concurrent.futures import ProcessPoolExecutor, as_completed + + if not configs: + return [] + + if not self.build_dir.exists(): + self.build_dir.mkdir(parents=True, exist_ok=True) + + ctypes_source = self.root / "bindings" / "ctypes" / "conv_ctypes_lib.cpp" + static_lib = self.build_dir / "libck_tile_dispatcher.a" + jit_root = self.build_dir / "generated_kernels" / "python_jit" + jit_root.mkdir(parents=True, exist_ok=True) + (self.build_dir / "examples").mkdir(parents=True, exist_ok=True) + + if not self.codegen_script.exists(): + if verbose: + print(f"Codegen script missing: {self.codegen_script}") + return [None] 
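# A minimal, self-contained sketch of the ranking behaviour of score() above: trait
# mismatches (pipeline/scheduler/epilogue) dominate, then tile/wave/warp-tile distances
# break ties. The candidate tuples below are hypothetical stand-ins for configs.
requested = ("compv4", "intrawave", "cshuffle", 128, 128, 64)

def demo_score(cand):
    # Same lexicographic idea as score(): trait mismatches first, then tile distance.
    return (
        0 if cand[0] == requested[0] else 1,
        0 if cand[1] == requested[1] else 1,
        0 if cand[2] == requested[2] else 1,
        sum(abs(a - b) for a, b in zip(cand[3:], requested[3:])),
    )

candidates = [
    ("compv4", "intrawave", "cshuffle", 128, 128, 32),  # exact traits, tile_k off by 32
    ("compv3", "intrawave", "cshuffle", 128, 128, 64),  # exact tile, pipeline differs
]
# min() over the lexicographic tuples prefers the exact trait match with a small tile delta.
assert min(candidates, key=demo_score) == candidates[0]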
* len(configs) + if not ctypes_source.exists() or not static_lib.exists(): + if verbose: + print("Missing conv ctypes source or static dispatcher library") + return [None] * len(configs) + + if verbose: + print( + f"Generating {len(configs)} grouped-conv kernels in parallel " + f"(workers={self.max_workers})..." + ) + + gen_jobs: List[Dict[str, Any]] = [] + job_dirs: List[Path] = [] + for i, c in enumerate(configs): + cfg_dir = jit_root / f"cfg_{i}" + cfg_dir.mkdir(parents=True, exist_ok=True) + job_dirs.append(cfg_dir) + + cmd = [ + sys.executable, + str(self.codegen_script), + "--output", + str(cfg_dir), + "--datatype", + c.dtype, + "--variant", + c.variant, + "--ndim", + str(c.ndim_spatial), + "--arch", + c.arch, + "--tile-m", + str(c.tile_m), + "--tile-n", + str(c.tile_n), + "--tile-k", + str(c.tile_k), + "--warp-m", + str(c.wave_m), + "--warp-n", + str(c.wave_n), + "--warp-k", + str(c.wave_k), + "--warp-tile-m", + str(c.warp_tile_m), + "--warp-tile-n", + str(c.warp_tile_n), + "--warp-tile-k", + str(c.warp_tile_k), + "--pipeline", + c.pipeline, + "--scheduler", + c.scheduler, + "--epilogue", + c.epilogue, + ] + gen_jobs.append({"cmd": cmd, "output_dir": str(cfg_dir)}) + + generated_headers: List[Optional[Path]] = [None] * len(configs) + with ProcessPoolExecutor(max_workers=self.max_workers) as executor: + futures = { + executor.submit(_run_conv_codegen_subprocess, job): idx + for idx, job in enumerate(gen_jobs) + } + for future in as_completed(futures): + idx = futures[future] + ok, header_path, err = future.result() + if ok and header_path: + generated_headers[idx] = Path(header_path) + if verbose: + print(f" OK [{idx}] codegen: {Path(header_path).name}") + else: + if verbose: + print(f" FAIL [{idx}] codegen: {err}") + + if verbose: + compile_count = sum(1 for h in generated_headers if h is not None) + print( + f"Compiling {compile_count} grouped-conv libraries in parallel " + f"(workers={self.max_workers})..." 
+ ) + + compile_jobs: List[Dict[str, Any]] = [] + compile_to_input_index: Dict[int, int] = {} + for i, c in enumerate(configs): + hdr_path = generated_headers[i] + if hdr_path is None: + continue + + cfg_dir = job_dirs[i] + dispatch_header = cfg_dir / "conv_python_dispatch.hpp" + _write_single_conv_dispatch_header(c, hdr_path, dispatch_header) + + lib_name = ( + f"libdispatcher_conv_{c.variant}_{c.ndim_spatial}d_{c.dtype}_" + f"{c.tile_str}_{c.wave_str}_{c.warp_str}_{c.pipeline}_{c.scheduler}.so" + ) + lib_path = self.build_dir / "examples" / lib_name + obj_file = lib_path.with_suffix(".o") + + compile_cmd = [ + "/opt/rocm/bin/hipcc", + "-c", + "-fPIC", + "-O3", + f"-I{self.root / 'include'}", + f"-I{self.root.parent / 'include'}", + f"-I{self.root.parent}", + f"-I{cfg_dir}", + "-DCK_TILE_SINGLE_KERNEL_INCLUDE", + f"-include{dispatch_header}", + "-D__HIP_PLATFORM_AMD__", + f"--offload-arch={c.arch}", + f'-DGFX_ARCH="{c.arch}"', + "-mllvm", + "-enable-noalias-to-md-conversion=0", + "-Wno-undefined-func-template", + "-Wno-float-equal", + str(ctypes_source), + "-o", + str(obj_file), + ] + link_cmd = [ + "/opt/rocm/bin/hipcc", + "-shared", + "-fPIC", + f"--offload-arch={c.arch}", + "--hip-link", + str(obj_file), + str(static_lib), + "-o", + str(lib_path), + ] + + compile_to_input_index[len(compile_jobs)] = i + compile_jobs.append( + { + "compile_cmd": compile_cmd, + "link_cmd": link_cmd, + "lib_path": str(lib_path), + "config_name": c.name, + } + ) + + results_map: Dict[int, Optional[Path]] = {i: None for i in range(len(configs))} + with ProcessPoolExecutor(max_workers=self.max_workers) as executor: + futures = { + executor.submit(_run_hipcc_subprocess, job): j + for j, job in enumerate(compile_jobs) + } + for future in as_completed(futures): + job_idx = futures[future] + idx = compile_to_input_index[job_idx] + success, lib_path, err = future.result() + if success and lib_path: + results_map[idx] = Path(lib_path) + if verbose: + status = "OK" if success else f"FAIL ({err})" + name = ( + Path(lib_path).name + if success and lib_path + else compile_jobs[job_idx]["config_name"] + ) + print(f" {status} {name}") + + return [results_map.get(i) for i in range(len(configs))] + + +# ============================================================================= +# Convenience functions +# ============================================================================= + + +def get_grouped_conv_default_config( + variant: str = "forward", + ndim_spatial: int = 2, + arch: str = "gfx942", + dtype: str = "fp16", +) -> GroupedConvKernelConfig: + """Return a valid default GroupedConvKernelConfig.""" + return GroupedConvKernelConfig( + variant=variant, + ndim_spatial=ndim_spatial, + arch=arch, + dtype=dtype, + ) + + +def format_grouped_conv_summary(config) -> str: + """Format a config (dict or GroupedConvKernelConfig) into a human-readable string.""" + if isinstance(config, GroupedConvKernelConfig): + lines = [ + f"Grouped Conv Config: {config.variant} {config.ndim_spatial}D", + f" Arch: {config.arch}", + f" Layout: {config.layout}", + f" Dtype: {config.dtype}", + f" Tile: {config.tile_str}", + f" Wave: {config.wave_str}", + f" Warp: {config.warp_str}", + f" Traits: pipeline={config.pipeline} epilogue={config.epilogue} scheduler={config.scheduler}", + ] + return "\n".join(lines) + + # Legacy dict support + tile_config = _get_tile_config(config) if isinstance(config, dict) else {} + trait_config = _get_trait_config(config) if isinstance(config, dict) else {} + variant = config.get("variant", "?") if 
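# Usage sketch for GroupedConvCodegenRunner above, with two hypothetical configs. It assumes
# a built dispatcher tree: the runner itself checks for the codegen script, the conv ctypes
# source, and the static dispatcher library, and returns None for any config that fails.
configs = [
    get_grouped_conv_default_config(variant="forward", ndim_spatial=2, arch="gfx942", dtype="fp16"),
    get_grouped_conv_default_config(variant="bwd_data", ndim_spatial=3, arch="gfx942", dtype="bf16"),
]
runner = GroupedConvCodegenRunner(max_workers=4)
lib_paths = runner.generate_and_compile_parallel(configs, verbose=True)
for cfg, path in zip(configs, lib_paths):
    print(f"{cfg.variant} {cfg.ndim_spatial}D ({cfg.dtype}): {path if path else 'build failed'}")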
isinstance(config, dict) else "?" + ndim = config.get("ndim_spatial", "?") if isinstance(config, dict) else "?" + arch = config.get("arch", "?") if isinstance(config, dict) else "?" + layout = config.get("layout", "?") if isinstance(config, dict) else "?" + dtype = config.get("dtype", "fp16") if isinstance(config, dict) else "fp16" + + lines = [f"Grouped Conv Config: {variant} {ndim}D"] + lines.append(f" Arch: {arch}") + lines.append(f" Layout: {layout}") + lines.append(f" Dtype: {dtype}") + + if tile_config: + wave = _extract_wave_config(tile_config) + warp = _extract_warp_tile_config(tile_config) + lines.append( + f" Tile: M={_first(tile_config.get('tile_m', 1))} N={_first(tile_config.get('tile_n', 128))} K={_first(tile_config.get('tile_k', 128))}" + ) + lines.append(f" Wave: {wave[0]}x{wave[1]}x{wave[2]}") + lines.append(f" Warp: {warp[0]}x{warp[1]}x{warp[2]}") + + if trait_config: + pipeline = _first(trait_config.get("pipeline", "?")) + epilogue = _first(trait_config.get("epilogue", "?")) + scheduler = _first(trait_config.get("scheduler", "?")) + lines.append( + f" Traits: pipeline={pipeline} epilogue={epilogue} scheduler={scheduler}" + ) + + return "\n".join(lines) if lines else "(empty config)" + + +def setup_multiple_grouped_conv_dispatchers( + configs: List[GroupedConvKernelConfig], + verbose: bool = True, + max_workers: Optional[int] = None, +) -> List[Optional[GroupedConvDispatcherLib]]: + """ + Setup multiple grouped-conv dispatchers in parallel. + + This keeps architecture filtering strict: + 1. Validate + auto-correct each requested config + 2. Query codegen's arch-valid config set for each (arch, dtype, variant, ndim) + 3. Map each request to nearest valid config + 4. Parallel codegen + parallel compile + """ + if not configs: + return [] + + codegen_script = ( + Path(__file__).parent.parent / "codegen" / "unified_grouped_conv_codegen.py" + ) + arch_valid_cache: Dict[ + Tuple[str, str, str, int], List[GroupedConvKernelConfig] + ] = {} + + selected_configs: List[Optional[GroupedConvKernelConfig]] = [] + for i, original in enumerate(configs): + c = copy.deepcopy(original) + + val = validate_grouped_conv_config(c.to_dict()) + if not val.is_valid: + corrected, corrected_result = auto_correct_grouped_conv_config(c.to_dict()) + if not corrected_result.is_valid: + if verbose: + print(f" FAIL [{i}] config remains invalid after auto-correct") + selected_configs.append(None) + continue + + tile_cfg = corrected.get("tile_config", {}) + trait_cfg = corrected.get("trait_config", {}) + c.variant = _resolve_variant( + str(_first(corrected.get("variant", c.variant))) + ) + c.ndim_spatial = int(_first(corrected.get("ndim_spatial", c.ndim_spatial))) + c.arch = str(corrected.get("arch", c.arch)) + c.layout = str(corrected.get("layout", c.layout)) + c.dtype = str(corrected.get("dtype", c.dtype)) + c.tile_m = int(_first(tile_cfg.get("tile_m", c.tile_m))) + c.tile_n = int(_first(tile_cfg.get("tile_n", c.tile_n))) + c.tile_k = int(_first(tile_cfg.get("tile_k", c.tile_k))) + c.wave_m = int(_first(tile_cfg.get("wave_m", c.wave_m))) + c.wave_n = int(_first(tile_cfg.get("wave_n", c.wave_n))) + c.wave_k = int(_first(tile_cfg.get("wave_k", c.wave_k))) + c.warp_tile_m = int(_first(tile_cfg.get("warp_tile_m", c.warp_tile_m))) + c.warp_tile_n = int(_first(tile_cfg.get("warp_tile_n", c.warp_tile_n))) + c.warp_tile_k = int(_first(tile_cfg.get("warp_tile_k", c.warp_tile_k))) + c.pipeline = str(_first(trait_cfg.get("pipeline", c.pipeline))) + c.scheduler = str(_first(trait_cfg.get("scheduler", c.scheduler))) + 
c.epilogue = str(_first(trait_cfg.get("epilogue", c.epilogue))) + + cache_key = (c.arch, c.dtype, c.variant, c.ndim_spatial) + if cache_key not in arch_valid_cache: + arch_valid_cache[cache_key] = _list_arch_valid_grouped_conv_configs( + codegen_script=codegen_script, + arch=c.arch, + dtype=c.dtype, + variant=c.variant, + ndim_spatial=c.ndim_spatial, + ) + if verbose and not arch_valid_cache[cache_key]: + print( + f" FAIL [{i}] no arch-valid configs listed for " + f"{c.arch}/{c.dtype}/{c.variant}/{c.ndim_spatial}d" + ) + + candidates = arch_valid_cache[cache_key] + if not candidates: + selected_configs.append(None) + continue + + selected = _select_best_arch_valid_conv_config(c, candidates) + if verbose and _config_key(selected) != _config_key(c): + print( + f" INFO [{i}] mapped to arch-valid config: " + f"{selected.tile_str} {selected.wave_str} {selected.warp_str} " + f"{selected.pipeline}/{selected.scheduler}/{selected.epilogue}" + ) + selected_configs.append(selected) + + unique_configs: List[GroupedConvKernelConfig] = [] + unique_index_by_key: Dict[Tuple[Any, ...], int] = {} + input_to_unique: List[Optional[int]] = [] + for cfg in selected_configs: + if cfg is None: + input_to_unique.append(None) + continue + key = _config_key(cfg) + if key not in unique_index_by_key: + unique_index_by_key[key] = len(unique_configs) + unique_configs.append(cfg) + input_to_unique.append(unique_index_by_key[key]) + + runner = GroupedConvCodegenRunner(max_workers=max_workers) + unique_lib_paths = runner.generate_and_compile_parallel( + unique_configs, verbose=verbose + ) + + libs: List[Optional[GroupedConvDispatcherLib]] = [] + loaded_cache: Dict[int, Optional[GroupedConvDispatcherLib]] = {} + for input_idx, unique_idx in enumerate(input_to_unique): + if unique_idx is None: + libs.append(None) + continue + + if unique_idx in loaded_cache: + libs.append(loaded_cache[unique_idx]) + continue + + path = ( + unique_lib_paths[unique_idx] if unique_idx < len(unique_lib_paths) else None + ) + disp: Optional[GroupedConvDispatcherLib] = None + if path and path.exists(): + try: + lib = ctypes.CDLL(str(path)) + disp = GroupedConvDispatcherLib(lib, path) + disp.initialize() + except Exception as e: + if verbose: + print(f" FAIL [{input_idx}] failed to load {path}: {e}") + loaded_cache[unique_idx] = disp + libs.append(disp) + + return libs + + +def detect_gpu_arch() -> str: + """Detect GPU architecture using rocminfo.""" + try: + out = subprocess.check_output( + ["rocminfo"], stderr=subprocess.DEVNULL, text=True + ) + for line in out.split("\n"): + if "gfx" in line.lower() and "name:" in line.lower(): + for part in line.split(): + if part.startswith("gfx"): + return part + except Exception: + pass + return "gfx942" diff --git a/dispatcher/python/requirements.txt b/dispatcher/python/requirements.txt index 9d429235f7..3ed0a13de8 100644 --- a/dispatcher/python/requirements.txt +++ b/dispatcher/python/requirements.txt @@ -1,11 +1,16 @@ # Core dependencies numpy>=1.19.0 +# ML Heuristic dependencies (OPTIONAL - large dependencies) +# For ML-based kernel selection, install separately: +# pip install -r ../requirements-ml.txt +# This avoids mandatory large dependencies (pyarrow, lightgbm) for users who don't need ML features + # Optional dependencies (install with pip install -e ".[torch]") # torch>=2.0.0 # Development dependencies (install with pip install -e ".[dev]") -# pytest>=6.0.0 +pytest>=6.0.0 # pytest-cov>=2.0.0 # black>=21.0 # flake8>=3.9.0 diff --git a/dispatcher/requirements-ml.txt b/dispatcher/requirements-ml.txt 
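# End-to-end sketch of the four-step flow documented in setup_multiple_grouped_conv_dispatchers
# (validate/auto-correct, query arch-valid configs, map to nearest, parallel build). The two
# requests below are hypothetical; detect_gpu_arch() falls back to "gfx942" if rocminfo is absent.
arch = detect_gpu_arch()
requests = [
    get_grouped_conv_default_config(variant="forward", ndim_spatial=2, arch=arch, dtype="fp16"),
    get_grouped_conv_default_config(variant="bwd_weight", ndim_spatial=2, arch=arch, dtype="fp16"),
]
dispatchers = setup_multiple_grouped_conv_dispatchers(requests, verbose=True, max_workers=4)
for req, disp in zip(requests, dispatchers):
    status = "ready" if disp is not None else "unavailable"
    print(f"{req.variant} {req.ndim_spatial}D ({req.dtype}): {status}")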
new file mode 100644 index 0000000000..68f60a3d91 --- /dev/null +++ b/dispatcher/requirements-ml.txt @@ -0,0 +1,6 @@ +# ML Heuristic dependencies for ML-based kernel selection +# Install with: pip install -r requirements-ml.txt +lightgbm>=3.0.0 +pandas>=1.3.0 +pyarrow>=6.0.0 +scikit-learn>=0.24.0 diff --git a/dispatcher/scripts/compile_gemm_examples.py b/dispatcher/scripts/compile_gemm_examples.py index b19c18a13a..98ba18ab51 100644 --- a/dispatcher/scripts/compile_gemm_examples.py +++ b/dispatcher/scripts/compile_gemm_examples.py @@ -94,17 +94,17 @@ def find_hipcc() -> str: def extract_conv_kernel_declarations(source_file: Path) -> list: - """Extract CONVOLUTION kernel declarations from C++ source file. + """Extract GROUPED CONVOLUTION kernel declarations from C++ source file. - Supports DECL_CONV_KERNEL_SET macro with ConvSig/ConvAlgo pattern. + Supports DECL_GROUPED_CONV_KERNEL_SET macro with ConvSig/ConvAlgo pattern. Extracts all parameters: dtype, layout, conv_type, dims, tile, wave, warp, pipeline, scheduler. """ content = source_file.read_text() declarations = [] seen = set() - # Pattern: DECL_CONV_KERNEL_SET(name, .add(...).add(...)) - set_pattern = r"DECL_CONV_KERNEL_SET\s*\(\s*(\w+)\s*,([^;]+)\)" + # Pattern: DECL_GROUPED_CONV_KERNEL_SET(name, .add(...).add(...)) + set_pattern = r"DECL_GROUPED_CONV_KERNEL_SET\s*\(\s*(\w+)\s*,([^;]+)\)" for match in re.finditer(set_pattern, content, re.DOTALL): set_name = match.group(1) @@ -396,24 +396,23 @@ def expand_conv_declaration_with_arch_filter(decl: dict, arch: str = "gfx942") - def generate_conv_kernels(declarations: list, gpu_target: str = "gfx942") -> int: - """Generate convolution kernels using unified_conv_codegen.""" + """Generate grouped convolution kernels using unified_grouped_conv_codegen.""" kernel_dir = get_generated_kernels_dir() kernel_dir.mkdir(parents=True, exist_ok=True) - # Import conv codegen codegen_dir = get_dispatcher_root() / "codegen" sys.path.insert(0, str(codegen_dir)) try: - from unified_conv_codegen import ( - UnifiedConvCodegen, - ConvKernelConfig, - ConvVariant, + from unified_grouped_conv_codegen import ( + UnifiedGroupedConvCodegen as UnifiedConvCodegen, + GroupedConvKernelConfig as ConvKernelConfig, + GroupedConvVariant as ConvVariant, TileConfig, - TraitConfig, + GroupedConvTraitConfig as TraitConfig, ) except ImportError as e: - print_error(f" Failed to import conv codegen: {e}") + print_error(f" Failed to import grouped conv codegen: {e}") return 0 codegen = UnifiedConvCodegen(kernel_dir) @@ -1564,9 +1563,9 @@ def build_exact_conv_kernel_filename(decl: dict) -> str: if conv_type == "forward": type_prefix = "fwd" elif conv_type == "bwd_data": - type_prefix = "bwdd" + type_prefix = "bwd_data" elif conv_type == "bwd_weight": - type_prefix = "bwdw" + type_prefix = "bwd_weight" else: type_prefix = conv_type @@ -1601,9 +1600,9 @@ def generate_specific_conv_kernel(decl: dict, gpu_target: str = "gfx942") -> boo else: variant = "forward" - # Use unified_conv_codegen + # Use unified_grouped_conv_codegen codegen_dir = get_dispatcher_root() / "codegen" - codegen_script = codegen_dir / "unified_conv_codegen.py" + codegen_script = codegen_dir / "unified_grouped_conv_codegen.py" output_dir = get_generated_kernels_dir() cmd = [ @@ -1661,9 +1660,9 @@ def find_conv_kernel_header(decl: dict, gpu_target: str = "gfx942") -> Path: if conv_type == "forward": type_prefix = "fwd" elif conv_type == "bwd_data": - type_prefix = "bwdd" + type_prefix = "bwd_data" elif conv_type == "bwd_weight": - type_prefix = "bwdw" + type_prefix = 
"bwd_weight" else: type_prefix = conv_type @@ -1865,7 +1864,9 @@ In your C++ code, declare kernels like: if not gemm_declarations and not conv_declarations: print_error(" No kernel declarations found!") - print(" Add DECL_KERNEL_SET for GEMM or DECL_CONV_KERNEL_SET for Conv") + print( + " Add DECL_KERNEL_SET for GEMM or DECL_GROUPED_CONV_KERNEL_SET for Grouped Conv" + ) return 1 # Handle GEMM declarations @@ -1913,7 +1914,7 @@ In your C++ code, declare kernels like: is_valid, error_msg = validate_kernel_config(decl, arch) if not is_valid: - print(f"\n ⚠ Invalid configuration: {decl_name}") + print(f"\n WARNING Invalid configuration: {decl_name}") # Parse the error and show specific auto-corrections corrections = [] @@ -1926,7 +1927,7 @@ In your C++ code, declare kernels like: decl["wave_m"] = -1 decl["wave_n"] = -1 corrections.append( - f"wave: {original_values['wave']} → [wildcard expansion]" + f"wave: {original_values['wave']} -> [wildcard expansion]" ) if "warp tile" in error_msg.lower(): @@ -1936,7 +1937,7 @@ In your C++ code, declare kernels like: decl["warp_m"] = -1 decl["warp_n"] = -1 corrections.append( - f"warp_tile: {original_values['warp']} → [wildcard expansion]" + f"warp_tile: {original_values['warp']} -> [wildcard expansion]" ) if "trait combination" in error_msg.lower(): @@ -1945,16 +1946,16 @@ In your C++ code, declare kernels like: decl["pipeline"] = "*" decl["scheduler"] = "*" corrections.append( - f"pipeline: {original_values['pipeline']} → [wildcard expansion]" + f"pipeline: {original_values['pipeline']} -> [wildcard expansion]" ) corrections.append( - f"scheduler: {original_values['scheduler']} → [wildcard expansion]" + f"scheduler: {original_values['scheduler']} -> [wildcard expansion]" ) # Print the auto-corrections print(" AUTO-CORRECTION:") for corr in corrections: - print(f" • {corr}") + print(f" - {corr}") auto_corrections.append((decl_name, corrections)) invalid_count += 1 @@ -1962,15 +1963,15 @@ In your C++ code, declare kernels like: if invalid_count > 0: print( - f"\n ⚠ {invalid_count} invalid config(s) auto-corrected via wildcard expansion" + f"\n WARNING {invalid_count} invalid config(s) auto-corrected via wildcard expansion" ) if wildcard_count > 0: print( - f" ✓ {len(gemm_declarations) - wildcard_count} explicit + {wildcard_count} wildcard (will expand)" + f" OK {len(gemm_declarations) - wildcard_count} explicit + {wildcard_count} wildcard (will expand)" ) else: - print(f" ✓ All {len(gemm_declarations)} configurations valid") + print(f" OK All {len(gemm_declarations)} configurations valid") # Expand GEMM declarations (for wildcards) print("\n Expanding wildcards to valid configurations...") @@ -1994,7 +1995,7 @@ In your C++ code, declare kernels like: wave_str = f"[{exp['wave_m']}, {exp['wave_n']}, {exp['wave_k']}]" warp_str = f"[{exp['warp_m']}, {exp['warp_n']}, {exp['warp_k']}]" print( - f" → wave={wave_str}, warp={warp_str}, pipeline={exp['pipeline']}, scheduler={exp['scheduler']}" + f" -> wave={wave_str}, warp={warp_str}, pipeline={exp['pipeline']}, scheduler={exp['scheduler']}" ) if len(expanded) > 3: print(f" ... 
and {len(expanded) - 3} more") @@ -2002,11 +2003,11 @@ In your C++ code, declare kernels like: exp = expanded[0] wave_str = f"[{exp['wave_m']}, {exp['wave_n']}, {exp['wave_k']}]" warp_str = f"[{exp['warp_m']}, {exp['warp_n']}, {exp['warp_k']}]" - print(f" {decl_name}: → wave={wave_str}, warp={warp_str}") + print(f" {decl_name}: -> wave={wave_str}, warp={warp_str}") if len(expanded_gemm) > len(gemm_declarations): print( - f"\n Total: {len(gemm_declarations)} declarations → {len(expanded_gemm)} configurations" + f"\n Total: {len(gemm_declarations)} declarations -> {len(expanded_gemm)} configurations" ) gemm_declarations = expanded_gemm @@ -2054,7 +2055,7 @@ In your C++ code, declare kernels like: is_valid, error_msg = validate_conv_kernel_config(decl, arch) if not is_valid: - print(f"\n ⚠ Invalid conv configuration: {decl_name}") + print(f"\n WARNING Invalid conv configuration: {decl_name}") # Parse the error and show specific auto-corrections corrections = [] @@ -2067,7 +2068,7 @@ In your C++ code, declare kernels like: decl["wave_m"] = -1 decl["wave_n"] = -1 corrections.append( - f"wave: {original_values['wave']} → [wildcard expansion]" + f"wave: {original_values['wave']} -> [wildcard expansion]" ) if "warp tile" in error_msg.lower(): @@ -2077,7 +2078,7 @@ In your C++ code, declare kernels like: decl["warp_m"] = -1 decl["warp_n"] = -1 corrections.append( - f"warp_tile: {original_values['warp']} → [wildcard expansion]" + f"warp_tile: {original_values['warp']} -> [wildcard expansion]" ) if "trait combination" in error_msg.lower(): @@ -2086,16 +2087,16 @@ In your C++ code, declare kernels like: decl["pipeline"] = "*" decl["scheduler"] = "*" corrections.append( - f"pipeline: {original_values['pipeline']} → [wildcard expansion]" + f"pipeline: {original_values['pipeline']} -> [wildcard expansion]" ) corrections.append( - f"scheduler: {original_values['scheduler']} → [wildcard expansion]" + f"scheduler: {original_values['scheduler']} -> [wildcard expansion]" ) # Print the auto-corrections print(" AUTO-CORRECTION:") for corr in corrections: - print(f" • {corr}") + print(f" - {corr}") auto_corrections.append((decl_name, corrections)) invalid_count += 1 @@ -2103,15 +2104,15 @@ In your C++ code, declare kernels like: if invalid_count > 0: print( - f"\n ⚠ {invalid_count} invalid config(s) auto-corrected via wildcard expansion" + f"\n WARNING {invalid_count} invalid config(s) auto-corrected via wildcard expansion" ) if wildcard_count > 0: print( - f" ✓ {len(conv_declarations) - wildcard_count} explicit + {wildcard_count} wildcard (will expand)" + f" OK {len(conv_declarations) - wildcard_count} explicit + {wildcard_count} wildcard (will expand)" ) else: - print(f" ✓ All {len(conv_declarations)} configurations valid") + print(f" OK All {len(conv_declarations)} configurations valid") # Expand Conv declarations (for wildcards) print("\n Expanding wildcards to valid configurations...") @@ -2134,7 +2135,7 @@ In your C++ code, declare kernels like: wave_str = f"[{exp['wave_m']}, {exp['wave_n']}, {exp['wave_k']}]" warp_str = f"[{exp['warp_m']}, {exp['warp_n']}, {exp['warp_k']}]" print( - f" → wave={wave_str}, warp={warp_str}, pipeline={exp['pipeline']}, scheduler={exp['scheduler']}" + f" -> wave={wave_str}, warp={warp_str}, pipeline={exp['pipeline']}, scheduler={exp['scheduler']}" ) if len(expanded) > 3: print(f" ... 
and {len(expanded) - 3} more") @@ -2142,11 +2143,11 @@ In your C++ code, declare kernels like: exp = expanded[0] wave_str = f"[{exp['wave_m']}, {exp['wave_n']}, {exp['wave_k']}]" warp_str = f"[{exp['warp_m']}, {exp['warp_n']}, {exp['warp_k']}]" - print(f" {decl_name}: → wave={wave_str}, warp={warp_str}") + print(f" {decl_name}: -> wave={wave_str}, warp={warp_str}") if len(expanded_conv) > len(conv_declarations): print( - f"\n Total: {len(conv_declarations)} declarations → {len(expanded_conv)} configurations" + f"\n Total: {len(conv_declarations)} declarations -> {len(expanded_conv)} configurations" ) conv_declarations = expanded_conv diff --git a/dispatcher/scripts/compile_grouped_conv_examples.py b/dispatcher/scripts/compile_grouped_conv_examples.py new file mode 100644 index 0000000000..32fe70a2de --- /dev/null +++ b/dispatcher/scripts/compile_grouped_conv_examples.py @@ -0,0 +1,882 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Self-contained build script for C++ grouped convolution examples. + +Parses DECL_GROUPED_CONV_KERNEL_SET declarations from source files, +generates the needed kernels, and compiles the example. + +Includes validation and auto-correction via wildcard expansion. + +Usage: + python3 compile_grouped_conv_examples.py examples/grouped_conv/cpp/02_grouped_conv_forward.cpp + python3 compile_grouped_conv_examples.py examples/grouped_conv/cpp/03_grouped_conv_validation.cpp --no-compile +""" + +import argparse +import os +import re +import subprocess +import sys +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path +from typing import Optional + +# Setup paths +SCRIPT_DIR = Path(__file__).parent.resolve() +DISPATCHER_DIR = SCRIPT_DIR.parent +CK_ROOT = DISPATCHER_DIR.parent + +sys.path.insert(0, str(DISPATCHER_DIR / "python")) +sys.path.insert(0, str(DISPATCHER_DIR / "codegen")) + +from dispatcher_common import ( # noqa: E402 + print_phase, + print_success, + print_error, + print_info, + find_hipcc, + get_arch_filter_data, + get_build_dir, + get_ck_root, + get_dispatcher_root, + get_generated_kernels_dir, +) + + +def extract_grouped_conv_declarations(source_file: Path) -> list: + """Extract DECL_GROUPED_CONV_KERNEL_SET declarations from C++ source.""" + content = source_file.read_text() + declarations = [] + + # Pattern: DECL_GROUPED_CONV_KERNEL_SET(name, .add(...).add(...)) + # Find all DECL_GROUPED_CONV_KERNEL_SET blocks by matching parentheses + pattern_start = r"DECL_GROUPED_CONV_KERNEL_SET\s*\(\s*(\w+)\s*," + for match in re.finditer(pattern_start, content): + set_name = match.group(1) + start_pos = match.end() + + # Find matching closing paren by counting parens + paren_count = 1 # We're already inside the first paren + end_pos = start_pos + for i, c in enumerate(content[start_pos:]): + if c == "(": + paren_count += 1 + elif c == ")": + paren_count -= 1 + if paren_count == 0: + end_pos = start_pos + i + break + + set_body = content[start_pos:end_pos] + + # Pattern 1: Simple add("dtype", "layout", "conv_type", tile_k, tile_c) + simple_add = ( + r'\.add\s*\(\s*"(\w+)"\s*,\s*"(\w+)"\s*,\s*"(\w+)"\s*,\s*(\d+)\s*,\s*(\d+)' + ) + for add_match in re.finditer(simple_add, set_body): + conv_type = add_match.group(3) + default_pipeline = ( + "compv3" if conv_type in ("bwd_data", "bwd_weight") else "compv4" + ) + declarations.append( + { + "set": set_name, + "dtype": add_match.group(1), + "layout": add_match.group(2), + "conv_type": conv_type, + "tile_k": 
int(add_match.group(4)), + "tile_c": int(add_match.group(5)), + "num_dims": 2, + "pipeline": default_pipeline, + "scheduler": "intrawave", + "wave_m": 2, + "wave_n": 2, + "wave_k": 1, + "warp_m": 32, + "warp_n": 32, + "warp_k": 16, + "arch": "gfx942", + } + ) + + # Pattern 2: Full ConvSig()/ConvAlgo() specification + # Find all .add( positions that start with ConvSig() + full_add = r"\.add\s*\(\s*ConvSig\(\)" + add_positions = [m.start() for m in re.finditer(full_add, set_body)] + + for pos in add_positions: + # Find matching closing paren by counting parens + paren_count = 0 + in_add = False + end = pos + for i, c in enumerate(set_body[pos:]): + if c == "(": + paren_count += 1 + in_add = True + elif c == ")": + paren_count -= 1 + if in_add and paren_count == 0: + end = pos + i + 1 + break + + add_str = set_body[pos:end] + + # Extract signature part (between ConvSig() and ConvAlgo()) + sig_match = re.search(r"ConvSig\(\)(.*?)ConvAlgo\(\)", add_str, re.DOTALL) + if not sig_match: + continue + sig_str = sig_match.group(1) + + # Extract algorithm part (between ConvAlgo() and arch string) + algo_match = re.search( + r'ConvAlgo\(\)(.*?),\s*"(\w+)"\s*\)', add_str, re.DOTALL + ) + if not algo_match: + continue + algo_str = algo_match.group(1) + arch = algo_match.group(2) + + # Parse signature + dtype = "fp16" + dtype_match = re.search(r'\.dtype\s*\(\s*"(\w+)"', sig_str) + if dtype_match: + dtype = dtype_match.group(1) + + layout = "nhwgc" + layout_match = re.search(r'\.layout\s*\(\s*"(\w+)"', sig_str) + if layout_match: + layout = layout_match.group(1) + + conv_type = "forward" + conv_type_match = re.search(r'\.conv_type\s*\(\s*"(\w+)"', sig_str) + if conv_type_match: + conv_type = conv_type_match.group(1) + + num_dims = 2 + dims_match = re.search(r"\.dims\s*\(\s*(\d+)", sig_str) + if dims_match: + num_dims = int(dims_match.group(1)) + + # Parse algorithm + tile_k, tile_c = 128, 128 + tile_match = re.search( + r"\.tile\s*\(\s*\d+\s*,\s*(\d+)\s*,\s*(\d+)", algo_str + ) + if tile_match: + tile_k = int(tile_match.group(1)) + tile_c = int(tile_match.group(2)) + + wave_m, wave_n, wave_k = 2, 2, 1 + wave_match = re.search( + r"\.wave\s*\(\s*(\d+)\s*,\s*(\d+)(?:\s*,\s*(\d+))?", algo_str + ) + if wave_match: + wave_m = int(wave_match.group(1)) + wave_n = int(wave_match.group(2)) + wave_k = int(wave_match.group(3) or 1) + + warp_m, warp_n, warp_k = 32, 32, 16 + warp_match = re.search( + r"\.warp\s*\(\s*(\d+)\s*,\s*(\d+)(?:\s*,\s*(\d+))?", algo_str + ) + if warp_match: + warp_m = int(warp_match.group(1)) + warp_n = int(warp_match.group(2)) + warp_k = int(warp_match.group(3) or 16) + + pipeline = "compv4" + pipeline_match = re.search(r'\.pipeline\s*\(\s*"(\w+)"', algo_str) + if pipeline_match: + pipeline = pipeline_match.group(1) + + scheduler = "intrawave" + scheduler_match = re.search(r'\.scheduler\s*\(\s*"(\w+)"', algo_str) + if scheduler_match: + scheduler = scheduler_match.group(1) + + # Parse additional parameters + vector_a, vector_b, vector_c = 4, 8, 8 + vector_match = re.search( + r"\.vector_sizes\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)", algo_str + ) + if vector_match: + vector_a = int(vector_match.group(1)) + vector_b = int(vector_match.group(2)) + vector_c = int(vector_match.group(3)) + + block_per_cu = 1 + block_per_cu_match = re.search(r"\.block_per_cu\s*\(\s*(\d+)", algo_str) + if block_per_cu_match: + block_per_cu = int(block_per_cu_match.group(1)) + + memory_op = "set" + memory_op_match = re.search(r'\.memory_op\s*\(\s*"(\w+)"', algo_str) + if memory_op_match: + memory_op = 
memory_op_match.group(1) + + epilogue = "cshuffle" + epilogue_match = re.search(r'\.epilogue\s*\(\s*"(\w+)"', algo_str) + if epilogue_match: + epilogue = epilogue_match.group(1) + + # Parse num_wave_groups (for V5 pipeline) + num_wave_groups = 1 + nwg_match = re.search(r"\.num_wave_groups\s*\(\s*(\d+)", algo_str) + if nwg_match: + num_wave_groups = int(nwg_match.group(1)) + + # Parse num_groups_to_merge (for merged group grouped convolution) + num_groups_to_merge = 1 + ngm_match = re.search(r"\.num_groups_to_merge\s*\(\s*(\d+)", algo_str) + if ngm_match: + num_groups_to_merge = int(ngm_match.group(1)) + + # Parse double_smem_buffer (for V4 pipeline) + double_smem_buffer = False + dsb_match = re.search( + r"\.double_smem_buffer\s*\(\s*(true|false)", algo_str, re.I + ) + if dsb_match: + double_smem_buffer = dsb_match.group(1).lower() == "true" + + # Parse padding flags + pad_m, pad_n, pad_k = True, True, True + padding_match = re.search( + r"\.padding\s*\(\s*(true|false)\s*,\s*(true|false)\s*,\s*(true|false)", + algo_str, + re.I, + ) + if padding_match: + pad_m = padding_match.group(1).lower() == "true" + pad_n = padding_match.group(2).lower() == "true" + pad_k = padding_match.group(3).lower() == "true" + + declarations.append( + { + "set": set_name, + "dtype": dtype, + "layout": layout, + "conv_type": conv_type, + "tile_k": tile_k, + "tile_c": tile_c, + "num_dims": num_dims, + "pipeline": pipeline, + "scheduler": scheduler, + "wave_m": wave_m, + "wave_n": wave_n, + "wave_k": wave_k, + "warp_m": warp_m, + "warp_n": warp_n, + "warp_k": warp_k, + "vector_a": vector_a, + "vector_b": vector_b, + "vector_c": vector_c, + "block_per_cu": block_per_cu, + "memory_op": memory_op, + "epilogue": epilogue, + "num_wave_groups": num_wave_groups, + "num_groups_to_merge": num_groups_to_merge, + "double_smem_buffer": double_smem_buffer, + "pad_m": pad_m, + "pad_n": pad_n, + "pad_k": pad_k, + "arch": arch, + } + ) + + return declarations + + +# ============================================================================= +# VALIDATION AND AUTO-CORRECTION +# ============================================================================= + + +def is_grouped_conv_wildcard_declaration(decl: dict) -> bool: + """Check if a declaration uses wildcards (-1 or '*').""" + wildcard_fields = ["wave_m", "wave_n", "warp_m", "warp_n", "pipeline", "scheduler"] + for field in wildcard_fields: + val = decl.get(field) + if val == -1 or val == "*": + return True + return False + + +def validate_grouped_conv_kernel_config(decl: dict, arch: str = "gfx942") -> tuple: + """Validate a grouped conv kernel configuration against known supported combinations. 
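# Sketch of the declaration shape extract_grouped_conv_declarations() above understands
# (Pattern 2, the full ConvSig()/ConvAlgo() form). The C++ snippet is a hypothetical
# example, embedded as a string only to show what the regexes pull out of it.
example_source = """
DECL_GROUPED_CONV_KERNEL_SET(demo_set,
    .add(ConvSig().dtype("fp16").layout("nhwgc").conv_type("forward").dims(2),
         ConvAlgo().tile(1, 128, 128).wave(2, 2, 1).warp(32, 32, 16)
                   .pipeline("compv4").scheduler("intrawave"),
         "gfx942"));
"""
import tempfile
from pathlib import Path as _Path
with tempfile.NamedTemporaryFile("w", suffix=".cpp", delete=False) as f:
    f.write(example_source)
decls = extract_grouped_conv_declarations(_Path(f.name))
# Expected: one dict with dtype "fp16", conv_type "forward", num_dims 2, tile_k/tile_c 128,
# wave [2, 2, 1], warp tile [32, 32, 16], pipeline "compv4", arch "gfx942".
assert decls and decls[0]["tile_k"] == 128 and decls[0]["arch"] == "gfx942"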
+ + Returns: (is_valid, error_message) + """ + # Skip validation for wildcards - expansion will filter invalid combos + if is_grouped_conv_wildcard_declaration(decl): + return (True, None) + + arch_data = get_arch_filter_data() + + pipeline = decl.get("pipeline", "compv4") + scheduler = decl.get("scheduler", "intrawave") + dtype = decl.get("dtype", "fp16") + + wave_m = decl.get("wave_m", 2) + wave_n = decl.get("wave_n", 2) + wave_k = decl.get("wave_k", 1) + + warp_m = decl.get("warp_m", 32) + warp_n = decl.get("warp_n", 32) + warp_k = decl.get("warp_k", 16) + + errors = [] + + # Check trait combination (pipeline, epilogue, scheduler) + combo = (pipeline, "cshuffle", scheduler) + if combo in arch_data["trait_unsupported"]: + errors.append( + f"Unsupported trait combination: pipeline={pipeline}, scheduler={scheduler}\n" + f" Valid schedulers for {pipeline}: intrawave" + ) + + # Check wave configuration for this arch + warp_combos = arch_data["warp_combos"].get(arch, [[2, 2, 1]]) + wave_cfg = [wave_m, wave_n, wave_k] + if wave_cfg not in warp_combos: + valid_str = ", ".join(f"[{c[0]},{c[1]},{c[2]}]" for c in warp_combos) + errors.append( + f"Unsupported wave configuration [{wave_m},{wave_n},{wave_k}] for {arch}\n" + f" Valid wave configs: {valid_str}" + ) + + # Check warp tile configuration for this arch and dtype + acc_dtype = "int32" if dtype == "int8" else "fp32" + dtype_key = f"{dtype}_{dtype}_{acc_dtype}" + warp_tile_combos = ( + arch_data["warp_tile_combos"] + .get(arch, {}) + .get(dtype_key, [[32, 32, 16], [16, 16, 16], [16, 16, 32]]) + ) + warp_cfg = [warp_m, warp_n, warp_k] + if warp_cfg not in warp_tile_combos: + valid_str = ", ".join(f"[{c[0]},{c[1]},{c[2]}]" for c in warp_tile_combos[:5]) + errors.append( + f"Unsupported warp tile [{warp_m},{warp_n},{warp_k}] for {arch}/{dtype}\n" + f" Valid warp tiles: {valid_str}" + ) + + # Check arch is supported + if arch not in arch_data["supported_archs"]: + errors.append( + f"Unsupported architecture: {arch}\n" + f" Supported: {', '.join(arch_data['supported_archs'])}" + ) + + if errors: + return (False, "\n".join(errors)) + + return (True, None) + + +def expand_grouped_conv_declaration_with_arch_filter( + decl: dict, arch: str = "gfx942" +) -> list: + """Expand a grouped conv declaration with wildcards into valid configurations. + + Wildcards: + - wave_m/wave_n = -1: Try all valid wave configs for this arch + - warp_m/warp_n = -1: Try all valid warp tiles for this arch/dtype + - pipeline/scheduler = "*": Try all valid combinations + + Returns a list of fully-specified declarations. 
+ """ + arch_data = get_arch_filter_data() + dtype = decl.get("dtype", "fp16") + + # Get valid combinations for this arch + valid_wave_combos = arch_data["warp_combos"].get(arch, [[2, 2, 1]]) + acc_dtype = "int32" if dtype == "int8" else "fp32" + dtype_key = f"{dtype}_{dtype}_{acc_dtype}" + valid_warp_tiles = ( + arch_data["warp_tile_combos"] + .get(arch, {}) + .get(dtype_key, [[32, 32, 16], [16, 16, 16]]) + ) + + # Valid pipelines and schedulers + valid_pipelines = ["compv3", "compv4"] + valid_schedulers = ["intrawave"] # interwave often unsupported + + # Determine which fields need expansion + expand_wave = decl.get("wave_m", 2) == -1 or decl.get("wave_n", 2) == -1 + expand_warp = decl.get("warp_m", 32) == -1 or decl.get("warp_n", 32) == -1 + expand_pipeline = decl.get("pipeline", "compv4") == "*" + expand_scheduler = decl.get("scheduler", "intrawave") == "*" + + # Build combinations + wave_options = ( + valid_wave_combos + if expand_wave + else [[decl.get("wave_m", 2), decl.get("wave_n", 2), decl.get("wave_k", 1)]] + ) + warp_options = ( + valid_warp_tiles + if expand_warp + else [[decl.get("warp_m", 32), decl.get("warp_n", 32), decl.get("warp_k", 16)]] + ) + pipeline_options = ( + valid_pipelines if expand_pipeline else [decl.get("pipeline", "compv4")] + ) + scheduler_options = ( + valid_schedulers if expand_scheduler else [decl.get("scheduler", "intrawave")] + ) + + expanded = [] + for wave in wave_options: + for warp in warp_options: + for pipeline in pipeline_options: + for scheduler in scheduler_options: + # Skip known invalid combinations + if (pipeline, "cshuffle", scheduler) in arch_data[ + "trait_unsupported" + ]: + continue + + new_decl = decl.copy() + new_decl["wave_m"] = wave[0] + new_decl["wave_n"] = wave[1] + new_decl["wave_k"] = wave[2] + new_decl["warp_m"] = warp[0] + new_decl["warp_n"] = warp[1] + new_decl["warp_k"] = warp[2] + new_decl["pipeline"] = pipeline + new_decl["scheduler"] = scheduler + + expanded.append(new_decl) + + # If no valid expansions, return original (will fail validation later) + if not expanded: + return [decl] + + # Return first valid config (or all if needed) + return expanded[:1] # Just use first valid config for grouped conv + + +def validate_and_expand_grouped_conv_declarations( + declarations: list, arch: str, verbose: bool = False +) -> list: + """Validate declarations and auto-correct invalid ones via wildcard expansion.""" + print(f"\n Validating against {arch} arch filter...") + + wildcard_count = 0 + invalid_count = 0 + auto_corrections = [] + + for decl in declarations: + decl_arch = decl.get("arch", arch) + decl_name = ( + f"{decl['dtype']}_{decl['conv_type']}_{decl['tile_k']}x{decl['tile_c']}" + ) + + # Check for wildcards + if is_grouped_conv_wildcard_declaration(decl): + wildcard_count += 1 + continue + + is_valid, error_msg = validate_grouped_conv_kernel_config(decl, decl_arch) + if not is_valid: + print(f"\n WARNING Invalid grouped conv configuration: {decl_name}") + + # Parse the error and show specific auto-corrections + corrections = [] + original_values = {} + + if "wave configuration" in error_msg.lower(): + original_values["wave"] = ( + f"[{decl.get('wave_m', 2)}, {decl.get('wave_n', 2)}, {decl.get('wave_k', 1)}]" + ) + decl["wave_m"] = -1 + decl["wave_n"] = -1 + corrections.append( + f"wave: {original_values['wave']} -> [wildcard expansion]" + ) + + if "warp tile" in error_msg.lower(): + original_values["warp"] = ( + f"[{decl.get('warp_m', 32)}, {decl.get('warp_n', 32)}, {decl.get('warp_k', 16)}]" + ) + decl["warp_m"] = -1 + 
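# Sketch of the wildcard flow implemented above: a wildcarded declaration is narrowed to the
# first arch-valid combination. The concrete values the expansion settles on depend on what
# get_arch_filter_data() reports for the target arch, so the comment below is only indicative.
decl = {
    "dtype": "fp16", "conv_type": "forward", "tile_k": 128, "tile_c": 128, "num_dims": 2,
    "wave_m": -1, "wave_n": -1, "wave_k": 1,      # wildcard wave config
    "warp_m": -1, "warp_n": -1, "warp_k": 16,     # wildcard warp tile
    "pipeline": "*", "scheduler": "*", "arch": "gfx942",
}
assert is_grouped_conv_wildcard_declaration(decl)
expanded = expand_grouped_conv_declaration_with_arch_filter(decl, "gfx942")
# Grouped conv keeps only the first valid expansion, e.g. wave [2, 2, 1],
# warp tile [32, 32, 16], pipeline "compv3", scheduler "intrawave".
print(expanded[0]["wave_m"], expanded[0]["warp_m"], expanded[0]["pipeline"])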
decl["warp_n"] = -1 + corrections.append( + f"warp_tile: {original_values['warp']} -> [wildcard expansion]" + ) + + if "trait combination" in error_msg.lower(): + original_values["pipeline"] = decl.get("pipeline", "compv4") + original_values["scheduler"] = decl.get("scheduler", "intrawave") + decl["pipeline"] = "*" + decl["scheduler"] = "*" + corrections.append( + f"pipeline: {original_values['pipeline']} -> [wildcard expansion]" + ) + corrections.append( + f"scheduler: {original_values['scheduler']} -> [wildcard expansion]" + ) + + # Print the auto-corrections + print(" AUTO-CORRECTION:") + for corr in corrections: + print(f" - {corr}") + auto_corrections.append((decl_name, corrections)) + + invalid_count += 1 + wildcard_count += 1 + + if invalid_count > 0: + print( + f"\n WARNING {invalid_count} invalid config(s) auto-corrected via wildcard expansion" + ) + + if wildcard_count > 0: + print( + f" OK {len(declarations) - wildcard_count} explicit + {wildcard_count} wildcard (will expand)" + ) + else: + print(f" OK All {len(declarations)} configurations valid") + + # Expand wildcards + print("\n Expanding wildcards to valid configurations...") + expanded_declarations = [] + for decl in declarations: + decl_arch = decl.get("arch", arch) + decl_name = ( + f"{decl['dtype']}_{decl['conv_type']}_{decl['tile_k']}x{decl['tile_c']}" + ) + + expanded = expand_grouped_conv_declaration_with_arch_filter(decl, decl_arch) + expanded_declarations.extend(expanded) + + if len(expanded) > 1: + print( + f" {decl_name}: expanded to {len(expanded)} valid configurations" + ) + for exp in expanded[:3]: + wave_str = f"[{exp['wave_m']}, {exp['wave_n']}, {exp['wave_k']}]" + warp_str = f"[{exp['warp_m']}, {exp['warp_n']}, {exp['warp_k']}]" + print( + f" -> wave={wave_str}, warp={warp_str}, pipeline={exp['pipeline']}" + ) + if len(expanded) > 3: + print(f" ... and {len(expanded) - 3} more") + elif is_grouped_conv_wildcard_declaration(decl) and len(expanded) == 1: + exp = expanded[0] + wave_str = f"[{exp['wave_m']}, {exp['wave_n']}, {exp['wave_k']}]" + warp_str = f"[{exp['warp_m']}, {exp['warp_n']}, {exp['warp_k']}]" + print(f" {decl_name}: -> wave={wave_str}, warp={warp_str}") + + if len(expanded_declarations) != len(declarations): + print( + f"\n Total: {len(declarations)} declarations -> {len(expanded_declarations)} configurations" + ) + + return expanded_declarations + + +def _generate_single_grouped_conv_kernel(args: tuple) -> tuple: + """Generate one grouped conv kernel (picklable for ProcessPoolExecutor). 
+ + Args: (decl, output_dir_str, gpu_target) + Returns: (idx, filepath_str or None, error_str or None) + """ + decl, output_dir_str, gpu_target = args + output_dir = Path(output_dir_str) + idx = decl.get("_idx", 0) + + try: + from codegen_common import TileConfig + from unified_grouped_conv_codegen import ( + GroupedConvKernelConfig, + GroupedConvTraitConfig, + GroupedConvVariant, + UnifiedGroupedConvCodegen, + ) + + # Map conv_type to variant + variant = GroupedConvVariant.FORWARD + if decl["conv_type"] == "bwd_data": + variant = GroupedConvVariant.BACKWARD_DATA + elif decl["conv_type"] == "bwd_weight": + variant = GroupedConvVariant.BACKWARD_WEIGHT + + pipeline = decl.get("pipeline", "compv4") + adj_tile_k = 64 * 2 if pipeline == "compv4" else 64 + + # Create tile config (tile_m=tile_k, tile_n=tile_c for conv GEMM view) + tile = TileConfig( + tile_m=decl["tile_k"], + tile_n=decl["tile_c"], + tile_k=adj_tile_k, + warp_m=decl["wave_m"], + warp_n=decl["wave_n"], + warp_k=decl.get("wave_k", 1), + warp_tile_m=decl["warp_m"], + warp_tile_n=decl["warp_n"], + warp_tile_k=decl["warp_k"], + ) + + trait = GroupedConvTraitConfig( + pipeline=pipeline, + scheduler=decl["scheduler"], + epilogue=decl.get("epilogue", "cshuffle"), + double_smem_buffer=decl.get("double_smem_buffer", False), + pad_m=decl.get("pad_m", True), + pad_n=decl.get("pad_n", True), + pad_k=decl.get("pad_k", True), + num_groups_to_merge=decl.get("num_groups_to_merge", 1), + ) + + config = GroupedConvKernelConfig( + tile=tile, + trait=trait, + variant=variant, + ndim_spatial=decl["num_dims"], + arch=decl.get("arch", gpu_target), + vector_size_a=decl.get("vector_a", 4), + vector_size_b=decl.get("vector_b", 8), + vector_size_c=decl.get("vector_c", 8), + block_per_cu=decl.get("block_per_cu", 1), + num_wave_groups=decl.get("num_wave_groups", 1), + num_groups_to_merge=decl.get("num_groups_to_merge", 1), + double_smem_buffer=decl.get("double_smem_buffer", False), + ) + + codegen = UnifiedGroupedConvCodegen(output_dir, gpu_target=gpu_target) + kernel_path, _ = codegen.generate_kernel(config, decl["dtype"], variant) + return (idx, str(kernel_path), None) + + except Exception as e: + return (idx, None, str(e)) + + +def generate_grouped_conv_kernels( + declarations: list, + output_dir: Path, + gpu_target: str = "gfx942", + max_workers: Optional[int] = None, +) -> list: + """Generate grouped convolution kernels using unified_grouped_conv_codegen. + + Uses ProcessPoolExecutor for parallel kernel generation. + """ + output_dir.mkdir(parents=True, exist_ok=True) + + # Prepare work items (add _idx for ordering) + work_items = [] + for idx, decl in enumerate(declarations): + decl_copy = decl.copy() + decl_copy["_idx"] = idx + work_items.append((decl_copy, str(output_dir), gpu_target)) + + max_workers = max_workers or min(len(work_items), os.cpu_count() or 4) + generated = [] + failed = [] + + with ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit(_generate_single_grouped_conv_kernel, w): w[0]["_idx"] + for w in work_items + } + for future in as_completed(futures): + idx, path, err = future.result() + if path: + generated.append(Path(path)) + print_info(f" Generated: {Path(path).name}") + else: + failed.append((idx, err)) + print_error(f" Failed kernel {idx + 1}: {err}") + + if failed: + for idx, err in failed[:3]: + print_error(f" Kernel {idx + 1}: {err[:200]}") + if len(failed) > 3: + print_error(f" ... 
and {len(failed) - 3} more failures") + + return generated + + +def compile_grouped_conv_example( + source_file: Path, + output_bin: Path, + kernel_headers: list, + hipcc: str, + gpu_target: str, +) -> bool: + """Compile the C++ example with generated kernels.""" + kernel_dir = get_generated_kernels_dir() + ck_root = get_ck_root() + dispatcher_dir = get_dispatcher_root() + + includes = [ + f"-I{ck_root / 'include'}", + f"-I{dispatcher_dir / 'include'}", + f"-I{kernel_dir}", + ] + + # Build include flags for generated kernels + kernel_includes = [] + for header in kernel_headers: + kernel_includes.extend(["-include", str(header)]) + + # Add define to indicate kernels are available + defines = ["-DGROUPED_CONV_KERNEL_AVAILABLE=1"] + + cmd = [ + hipcc, + "-std=c++20", + "-O2", + f"--offload-arch={gpu_target}", + *includes, + *defines, + *kernel_includes, + "-o", + str(output_bin), + str(source_file), + ] + + print_info(f" Compiling: {source_file.name}") + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + if result.stderr: + lines = result.stderr.split("\n") + errors = [line for line in lines if "error:" in line.lower()][:5] + for err_line in errors: + print_error(f" {err_line}") + return False + + return True + + +def main(): + parser = argparse.ArgumentParser( + description="Build C++ grouped convolution example with self-contained kernel generation" + ) + parser.add_argument("source", help="Source file (.cpp)") + parser.add_argument("--output", "-o", help="Output binary name") + parser.add_argument("--gpu-target", default="gfx942", help="GPU target") + parser.add_argument( + "--no-compile", action="store_true", help="Only generate kernels, don't compile" + ) + parser.add_argument("--verbose", "-v", action="store_true") + parser.add_argument( + "--jobs", + "-j", + type=int, + default=None, + help="Parallel jobs for kernel generation (default: cpu_count)", + ) + args = parser.parse_args() + + # Resolve source file + source_file = Path(args.source) + if not source_file.is_absolute(): + candidates = [ + get_dispatcher_root() / args.source, + Path.cwd() / args.source, + ] + for c in candidates: + if c.exists(): + source_file = c + break + + if not source_file.exists(): + print_error(f"Source file not found: {source_file}") + return 1 + + build_dir = get_build_dir() + kernel_dir = get_generated_kernels_dir() + output_name = args.output or source_file.stem + output_bin = build_dir / output_name + + print_success("=== Grouped Conv Example Builder (Self-Contained) ===") + + # Phase 1: Extract declarations + print_phase(1, "Scanning for DECL_GROUPED_CONV_KERNEL_SET...") + declarations = extract_grouped_conv_declarations(source_file) + + if not declarations: + print_error(" No DECL_GROUPED_CONV_KERNEL_SET declarations found!") + return 1 + + print(f" Found {len(declarations)} kernel declaration(s):") + for decl in declarations: + name = f"{decl['dtype']}_{decl['conv_type']}_{decl['num_dims']}d_{decl['tile_k']}x{decl['tile_c']}" + print(f" [{decl['set']}] {name}") + + # Phase 2: Validate and expand + print_phase(2, "Validating and expanding declarations...") + declarations = validate_and_expand_grouped_conv_declarations( + declarations, args.gpu_target, args.verbose + ) + print() + + # Phase 3: Generate kernels + print_phase(3, "Generating kernels...") + generated = generate_grouped_conv_kernels( + declarations, kernel_dir, args.gpu_target, max_workers=args.jobs + ) + + if not generated: + print_error(" No kernels generated!") + return 1 + + print(f" 
Generated {len(generated)} kernel file(s)") + print() + + # Phase 4: Compile (optional) + if args.no_compile: + print_info("Skipping compilation (--no-compile)") + print() + print_success("=== Kernel Generation Complete ===") + print(f"Kernels in: {kernel_dir}") + return 0 + + print_phase(4, "Compiling example...") + hipcc_path = find_hipcc() + + if not hipcc_path: + print_error(" hipcc not found. Install ROCm or set HIPCC env var.") + print(" To compile manually:") + ck_root = get_dispatcher_root().parent + print( + f" hipcc -std=c++20 -O2 -I{ck_root / 'include'} -I{get_dispatcher_root() / 'include'} \\" + ) + print(f" -I{kernel_dir} \\") + for h in generated[:1]: + print(f" -include {h} \\") + print(" -DGROUPED_CONV_KERNEL_AVAILABLE=1 \\") + print(f" --offload-arch={args.gpu_target} \\") + print(f" {source_file} -o {output_bin}") + return 1 + + build_dir.mkdir(parents=True, exist_ok=True) + + if not compile_grouped_conv_example( + source_file, output_bin, generated, hipcc_path, args.gpu_target + ): + print_error(" Compilation failed!") + return 1 + + print_success(f" Output: {output_bin}") + print() + + print_success("=== Build Complete ===") + print() + print("Run with:") + print(f" {output_bin}") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dispatcher/scripts/example_kernel_builder.py b/dispatcher/scripts/example_kernel_builder.py index d3bb619174..20952cd91f 100755 --- a/dispatcher/scripts/example_kernel_builder.py +++ b/dispatcher/scripts/example_kernel_builder.py @@ -55,10 +55,10 @@ def extract_balanced_parens(text: str, start_pos: int) -> str: def parse_conv_declarations(content: str) -> List[Dict]: - """Parse DECL_CONV_KERNEL_SET declarations with all parameters.""" + """Parse DECL_GROUPED_CONV_KERNEL_SET declarations with all parameters.""" kernels = [] - for match in re.finditer(r"DECL_CONV_KERNEL_SET\s*\(", content): + for match in re.finditer(r"DECL_GROUPED_CONV_KERNEL_SET\s*\(", content): body = extract_balanced_parens(content, match.end() - 1) if not body: continue @@ -619,7 +619,7 @@ def strip_cpp_strings_and_comments(content: str) -> str: n = len(content) # Patterns that indicate a string is problematic and should be stripped - problematic_patterns = ["DECL_KERNEL_SET", "DECL_CONV_KERNEL_SET", ".add("] + problematic_patterns = ["DECL_KERNEL_SET", "DECL_GROUPED_CONV_KERNEL_SET", ".add("] while i < n: # Check for raw string literal: R"delimiter(...)delimiter" @@ -697,7 +697,7 @@ def detect_and_parse(source_path: Path) -> Tuple[str, List[Dict]]: content = source_path.read_text() content = strip_cpp_strings_and_comments(content) - if "DECL_CONV_KERNEL_SET" in content: + if "DECL_GROUPED_CONV_KERNEL_SET" in content: return "conv", parse_conv_declarations(content) elif "DECL_KERNEL_SET" in content: return "gemm", parse_gemm_declarations(content) @@ -966,30 +966,128 @@ def generate_per_set_functions(source_stem: str) -> str: def generate_conv_registration( kernel_headers: List[Path], example_name: str, kernels: List[Dict] ) -> str: - """Generate Conv kernel registration code for the dispatcher registry.""" + """Generate Conv kernel registration code for the dispatcher registry. + + Creates real GroupedConvKernelInstance entries backed by the generated + launcher's launch() method via the conv backend RunFn factories. 
+ """ if not kernel_headers: return " // No kernels to register" lines = [] - lines.append( - " (void)registry; (void)arch; // Conv uses direct launcher pattern for now" - ) - # For conv, we provide direct access to kernel launchers for i, h in enumerate(kernel_headers): - kernel_name = h.stem - lines.append(f" // Kernel {i + 1}: {kernel_name}") + kname = h.stem + ns = f"ns_{kname}" + launcher = f"{ns}::{kname}_Launcher" + + # Determine direction and ndim from the kernel header name + if "_fwd_" in kname: + direction = "Forward" + run_fn_factory = "make_conv_fwd_run_fn" + elif "_bwd_data_" in kname or "_bwdd_" in kname: + direction = "BackwardData" + run_fn_factory = "make_conv_bwd_data_run_fn" + elif "_bwd_weight_" in kname or "_bwdw_" in kname: + direction = "BackwardWeight" + run_fn_factory = "make_conv_bwd_weight_run_fn" + else: + direction = "Forward" + run_fn_factory = "make_conv_fwd_run_fn" + + ndim = 3 if "_3d_" in kname else 2 + + # Parse dtype from name (e.g. grouped_conv_fwd_fp16_...) + dtype = "fp16" + for dt in ["fp16", "bf16", "fp32"]: + if f"_{dt}_" in kname: + dtype = dt + break + + # Parse tile, wave, warp from name. + # Format: ..._TILExTILExTILE_WAVExWAVExWAVE_WARPxWARPxWARP_... + import re as _re + + tile_m, tile_n, tile_k = 1, 128, 128 + wave_m, wave_n, wave_k = 2, 2, 1 + warp_m, warp_n, warp_k = 32, 32, 16 + + triplets = _re.findall(r"_(\d+)x(\d+)x(\d+)", kname) + if len(triplets) >= 1: + tile_m, tile_n, tile_k = ( + int(triplets[0][0]), + int(triplets[0][1]), + int(triplets[0][2]), + ) + if len(triplets) >= 2: + wave_m, wave_n, wave_k = ( + int(triplets[1][0]), + int(triplets[1][1]), + int(triplets[1][2]), + ) + if len(triplets) >= 3: + warp_m, warp_n, warp_k = ( + int(triplets[2][0]), + int(triplets[2][1]), + int(triplets[2][2]), + ) + + pipeline = "compv4" if "compv4" in kname else "compv3" + scheduler = "interwave" if "interwave" in kname else "intrawave" + epilogue = "cshuffle" if "cshuffle" in kname else "default" + + # ConvConfigBase defaults + vec_a, vec_b, vec_c = 4, 8, 8 + block_per_cu = 1 + num_wave_groups = 1 + num_groups_to_merge = 1 + + lines.append(f" // Kernel {i + 1}: {kname}") + lines.append(" {") + lines.append(f" ck_tile::dispatcher::GroupedConvKernelKey key_{i};") + lines.append(f' key_{i}.dtype_in = "{dtype}";') + lines.append(f' key_{i}.dtype_wei = "{dtype}";') + lines.append(f' key_{i}.dtype_out = "{dtype}";') + lines.append(f' key_{i}.layout = "nhwgc";') + lines.append(f" key_{i}.ndim_spatial = {ndim};") + lines.append( + f" key_{i}.op = ck_tile::dispatcher::GroupedConvOp::{direction};" + ) + lines.append(f" key_{i}.tile_m = {tile_m};") + lines.append(f" key_{i}.tile_n = {tile_n};") + lines.append(f" key_{i}.tile_k = {tile_k};") + lines.append(f" key_{i}.wave_m = {wave_m};") + lines.append(f" key_{i}.wave_n = {wave_n};") + lines.append(f" key_{i}.wave_k = {wave_k};") + lines.append(f" key_{i}.warp_m = {warp_m};") + lines.append(f" key_{i}.warp_n = {warp_n};") + lines.append(f" key_{i}.warp_k = {warp_k};") + lines.append(f' key_{i}.pipeline = "{pipeline}";') + lines.append(f' key_{i}.scheduler = "{scheduler}";') + lines.append(f' key_{i}.epilogue = "{epilogue}";') + lines.append(f" key_{i}.vector_size_a = {vec_a};") + lines.append(f" key_{i}.vector_size_b = {vec_b};") + lines.append(f" key_{i}.vector_size_c = {vec_c};") + lines.append(f" key_{i}.block_per_cu = {block_per_cu};") + lines.append(f" key_{i}.num_wave_groups = {num_wave_groups};") + lines.append(f" key_{i}.num_groups_to_merge = {num_groups_to_merge};") + lines.append(f" key_{i}.arch 
= arch;") + lines.append( + f" auto run_fn_{i} = ck_tile::dispatcher::backends::{run_fn_factory}<{launcher}, {ndim}>();" + ) + lines.append( + f' auto inst_{i} = std::make_shared(key_{i}, "{kname}", std::move(run_fn_{i}));' + ) + lines.append(f" registry.register_kernel(key_{i}, inst_{i});") + lines.append(" }") return "\n".join(lines) -def generate_conv_kernels( - kernels: List[Dict], output_dir: Path, codegen_dir: Path -) -> bool: - """Generate Conv kernels for ALL declarations using unified codegen.""" - if not kernels: - return False - +def _build_conv_codegen_cmd( + idx: int, k: Dict, codegen_dir: Path, output_dir: Path +) -> Tuple[int, List[str], str]: + """Build the command for a single conv kernel codegen invocation.""" variant_map = { "forward": "forward", "bwd_data": "bwd_data", @@ -997,93 +1095,130 @@ def generate_conv_kernels( "bwd_weight": "bwd_weight", "backward_weight": "bwd_weight", } + variant = variant_map.get(k.get("conv_type", "forward"), "forward") + + cmd = [ + sys.executable, + str(codegen_dir / "unified_grouped_conv_codegen.py"), + "--datatype", + k.get("dtype", "fp16"), + "--variant", + variant, + "--ndim", + str(k.get("ndim", 2)), + "--output", + str(output_dir), + ] + + if k.get("tile_m"): + cmd.extend(["--tile-m", str(k["tile_m"])]) + if k.get("tile_n"): + cmd.extend(["--tile-n", str(k["tile_n"])]) + if k.get("warp_m"): + cmd.extend(["--warp-m", str(k["warp_m"])]) + if k.get("warp_n"): + cmd.extend(["--warp-n", str(k["warp_n"])]) + if k.get("warp_k"): + cmd.extend(["--warp-k", str(k["warp_k"])]) + if k.get("warp_tile_m"): + cmd.extend(["--warp-tile-m", str(k["warp_tile_m"])]) + if k.get("warp_tile_n"): + cmd.extend(["--warp-tile-n", str(k["warp_tile_n"])]) + if k.get("warp_tile_k"): + cmd.extend(["--warp-tile-k", str(k["warp_tile_k"])]) + if k.get("pipeline"): + cmd.extend(["--pipeline", k["pipeline"]]) + if k.get("scheduler"): + cmd.extend(["--scheduler", k["scheduler"]]) + if k.get("epilogue"): + cmd.extend(["--epilogue", k["epilogue"]]) + if k.get("vector_a"): + cmd.extend(["--vector-a", str(k["vector_a"])]) + if k.get("vector_b"): + cmd.extend(["--vector-b", str(k["vector_b"])]) + if k.get("vector_c"): + cmd.extend(["--vector-c", str(k["vector_c"])]) + if k.get("block_per_cu"): + cmd.extend(["--block-per-cu", str(k["block_per_cu"])]) + if k.get("num_wave_groups"): + cmd.extend(["--num-wave-groups", str(k["num_wave_groups"])]) + if k.get("num_groups_to_merge"): + cmd.extend(["--num-groups-to-merge", str(k["num_groups_to_merge"])]) + if k.get("double_smem_buffer") is not None: + cmd.extend(["--double-smem-buffer", str(k["double_smem_buffer"]).lower()]) + if k.get("tile_k"): + cmd.extend(["--tile-k", str(k["tile_k"])]) + + return (idx, cmd, str(codegen_dir)) + + +def _run_conv_codegen(args: Tuple) -> Tuple[int, bool, str]: + """Run unified_grouped_conv_codegen.py for a single kernel config (picklable for ProcessPoolExecutor).""" + idx, cmd, cwd = args + result = subprocess.run(cmd, capture_output=True, text=True, cwd=cwd) + if result.returncode != 0: + return (idx, False, result.stderr[:300]) + return (idx, True, "") + + +def generate_conv_kernels( + kernels: List[Dict], output_dir: Path, codegen_dir: Path +) -> bool: + """Generate Conv kernels for ALL declarations using unified codegen. + + Launches all codegen subprocesses in parallel via ProcessPoolExecutor + for significantly faster generation when multiple conv kernels are declared. 
+ """ + if not kernels: + return False + + work_items = [ + _build_conv_codegen_cmd(idx, k, codegen_dir, output_dir) + for idx, k in enumerate(kernels) + ] success_count = 0 + max_workers = min(len(work_items), os.cpu_count() or 4) - # Generate a kernel for EACH declaration - for idx, k in enumerate(kernels): - variant = variant_map.get(k.get("conv_type", "forward"), "forward") - - cmd = [ - sys.executable, - str(codegen_dir / "unified_conv_codegen.py"), - "--datatype", - k.get("dtype", "fp16"), - "--variant", - variant, - "--ndim", - str(k.get("ndim", 2)), - "--output", - str(output_dir), - ] - - # Add optional parameters if specified - if k.get("tile_m"): - cmd.extend(["--tile-m", str(k["tile_m"])]) - if k.get("tile_n"): - cmd.extend(["--tile-n", str(k["tile_n"])]) - if k.get("warp_m"): - cmd.extend(["--warp-m", str(k["warp_m"])]) - if k.get("warp_n"): - cmd.extend(["--warp-n", str(k["warp_n"])]) - if k.get("warp_k"): - cmd.extend(["--warp-k", str(k["warp_k"])]) - if k.get("warp_tile_m"): - cmd.extend(["--warp-tile-m", str(k["warp_tile_m"])]) - if k.get("warp_tile_n"): - cmd.extend(["--warp-tile-n", str(k["warp_tile_n"])]) - if k.get("warp_tile_k"): - cmd.extend(["--warp-tile-k", str(k["warp_tile_k"])]) - if k.get("pipeline"): - cmd.extend(["--pipeline", k["pipeline"]]) - if k.get("scheduler"): - cmd.extend(["--scheduler", k["scheduler"]]) - if k.get("epilogue"): - cmd.extend(["--epilogue", k["epilogue"]]) - if k.get("vector_a"): - cmd.extend(["--vector-a", str(k["vector_a"])]) - if k.get("vector_b"): - cmd.extend(["--vector-b", str(k["vector_b"])]) - if k.get("vector_c"): - cmd.extend(["--vector-c", str(k["vector_c"])]) - if k.get("block_per_cu"): - cmd.extend(["--block-per-cu", str(k["block_per_cu"])]) - if k.get("num_wave_groups"): - cmd.extend(["--num-wave-groups", str(k["num_wave_groups"])]) - if k.get("num_groups_to_merge"): - cmd.extend(["--num-groups-to-merge", str(k["num_groups_to_merge"])]) - if k.get("double_smem_buffer") is not None: - cmd.extend(["--double-smem-buffer", str(k["double_smem_buffer"]).lower()]) - if k.get("tile_k"): - cmd.extend(["--tile-k", str(k["tile_k"])]) - - result = subprocess.run( - cmd, capture_output=True, text=True, cwd=str(codegen_dir) - ) - if result.returncode != 0: - print(f" Codegen error for kernel {idx + 1}: {result.stderr[:300]}") - else: - success_count += 1 + with ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(_run_conv_codegen, w): w[0] for w in work_items} + for future in as_completed(futures): + idx, ok, err = future.result() + if ok: + success_count += 1 + else: + print(f" Codegen error for kernel {idx + 1}: {err}") return success_count > 0 +def _run_gemm_codegen(args: Tuple) -> Tuple[int, bool, str]: + """Run unified_gemm_codegen.py for a single kernel config (picklable for ProcessPoolExecutor).""" + idx, cmd, cwd = args + result = subprocess.run(cmd, capture_output=True, text=True, cwd=cwd) + if result.returncode != 0: + return (idx, False, result.stderr[:300]) + return (idx, True, "") + + def generate_gemm_kernels( kernels: List[Dict], output_dir: Path, codegen_dir: Path ) -> bool: - """Generate GEMM kernels for ALL declarations using unified codegen.""" + """Generate GEMM kernels for ALL declarations using unified codegen. + + Launches all codegen subprocesses in parallel via ProcessPoolExecutor + for significantly faster generation when multiple kernels are declared. 
+ """ import json if not kernels: return False - success_count = 0 - - # Generate a kernel for EACH declaration + # Build all commands upfront + work_items = [] for idx, k in enumerate(kernels): variant = "multi_d" if k.get("elementwise_op") else "standard" - # Build tile config JSON for this specific kernel tile_config = { "tile_m": [k.get("tile_m", 128)], "tile_n": [k.get("tile_n", 128)], @@ -1125,13 +1260,20 @@ def generate_gemm_kernels( config_json, ] - result = subprocess.run( - cmd, capture_output=True, text=True, cwd=str(codegen_dir) - ) - if result.returncode != 0: - print(f" Codegen error for kernel {idx + 1}: {result.stderr[:300]}") - else: - success_count += 1 + work_items.append((idx, cmd, str(codegen_dir))) + + # Run all codegen subprocesses in parallel + success_count = 0 + max_workers = min(len(work_items), os.cpu_count() or 4) + + with ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(_run_gemm_codegen, w): w[0] for w in work_items} + for future in as_completed(futures): + idx, ok, err = future.result() + if ok: + success_count += 1 + else: + print(f" Codegen error for kernel {idx + 1}: {err}") return success_count > 0 @@ -1229,15 +1371,17 @@ def main(): if example_type == "gemm": kernel_headers = list(args.output_dir.glob("gemm_*.hpp")) else: - k = kernels[0] if kernels else {} - variant = k.get("conv_type", "forward") prefix_map = { - "forward": "conv_fwd", - "bwd_data": "conv_bwdd", - "bwd_weight": "conv_bwdw", + "forward": "grouped_conv_fwd", + "bwd_data": "grouped_conv_bwd_data", + "bwd_weight": "grouped_conv_bwd_weight", } - prefix = prefix_map.get(variant, "conv_fwd") - kernel_headers = list(args.output_dir.glob(f"{prefix}_*.hpp")) + # Collect headers from ALL variants present in declarations + variants_used = set(k.get("conv_type", "forward") for k in kernels) + kernel_headers = [] + for variant in variants_used: + prefix = prefix_map.get(variant, "grouped_conv_fwd") + kernel_headers.extend(args.output_dir.glob(f"{prefix}_*.hpp")) if not kernel_headers: print(f"[{target_name}] No kernel headers generated!") @@ -1347,29 +1491,39 @@ def main(): ) if has_bwd_data: - bwdd_kernel = find_kernel_by_dtype_type(kernel_headers, "fp16", "_bwdd_") - if bwdd_kernel: - bwdd_ns = f"ns_{bwdd_kernel.stem}" - launcher_aliases.append( - f"using BwdDataKernelLauncher = {bwdd_ns}::{bwdd_kernel.stem}_Launcher;" + bwd_data_kernel = find_kernel_by_dtype_type( + kernel_headers, "fp16", "_bwd_data_" + ) + if not bwd_data_kernel: + bwd_data_kernel = find_kernel_by_dtype_type( + kernel_headers, "fp16", "_bwdd_" ) - if not has_fwd: # If no fwd, use bwd_data as first + if bwd_data_kernel: + bwd_data_ns = f"ns_{bwd_data_kernel.stem}" + launcher_aliases.append( + f"using BwdDataKernelLauncher = {bwd_data_ns}::{bwd_data_kernel.stem}_Launcher;" + ) + if not has_fwd: launcher_aliases.append( - f"using FirstKernelLauncher = {bwdd_ns}::{bwdd_kernel.stem}_Launcher;" + f"using FirstKernelLauncher = {bwd_data_ns}::{bwd_data_kernel.stem}_Launcher;" ) if has_bwd_weight: - bwdw_kernel = find_kernel_by_dtype_type(kernel_headers, "fp16", "_bwdw_") - if bwdw_kernel: - bwdw_ns = f"ns_{bwdw_kernel.stem}" - launcher_aliases.append( - f"using BwdWeightKernelLauncher = {bwdw_ns}::{bwdw_kernel.stem}_Launcher;" + bwd_weight_kernel = find_kernel_by_dtype_type( + kernel_headers, "fp16", "_bwd_weight_" + ) + if not bwd_weight_kernel: + bwd_weight_kernel = find_kernel_by_dtype_type( + kernel_headers, "fp16", "_bwdw_" ) - if ( - not has_fwd and not has_bwd_data - ): # If no fwd or bwdd, 
use bwdw as first + if bwd_weight_kernel: + bwd_weight_ns = f"ns_{bwd_weight_kernel.stem}" + launcher_aliases.append( + f"using BwdWeightKernelLauncher = {bwd_weight_ns}::{bwd_weight_kernel.stem}_Launcher;" + ) + if not has_fwd and not has_bwd_data: launcher_aliases.append( - f"using FirstKernelLauncher = {bwdw_ns}::{bwdw_kernel.stem}_Launcher;" + f"using FirstKernelLauncher = {bwd_weight_ns}::{bwd_weight_kernel.stem}_Launcher;" ) launcher_section = "\n".join(launcher_aliases) @@ -1382,14 +1536,16 @@ def main(): #include "ck_tile/dispatcher/registry.hpp" #include "ck_tile/dispatcher/kernel_instance.hpp" #include "ck_tile/dispatcher/kernel_key.hpp" +#include "ck_tile/dispatcher/grouped_conv_registry.hpp" +#include "ck_tile/dispatcher/backends/generated_conv_backend.hpp" namespace generated {{ // Kernel launchers for direct use {launcher_section} -// Registration function -inline void {func_name}(ck_tile::dispatcher::Registry& registry, const std::string& arch) {{ +// Registration function (takes GroupedConvRegistry for conv kernels) +inline void {func_name}(ck_tile::dispatcher::GroupedConvRegistry& registry, const std::string& arch) {{ {register_body} }} @@ -1439,7 +1595,7 @@ inline void {func_name}(ck_tile::dispatcher::Registry& registry, const std::stri """ header_path.write_text(header_content) - print(f"[{target_name}] ✓ {len(obj_files)} kernels compiled") + print(f"[{target_name}] OK {len(obj_files)} kernels compiled") return 0 diff --git a/dispatcher/scripts/generate_conv_dispatch_header.py b/dispatcher/scripts/generate_conv_dispatch_header.py new file mode 100644 index 0000000000..55cc085ed9 --- /dev/null +++ b/dispatcher/scripts/generate_conv_dispatch_header.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +"""Generate the conv_python_dispatch.hpp header for the Python conv library. + +Reads the include_all headers to find available kernels and creates dispatch +aliases for 2D/3D x fwd/bwd_data/bwd_weight. 
+""" + +import argparse +import re +from pathlib import Path + + +def find_3d_launcher(include_all_path: Path, variant_prefix: str) -> str: + """Find first 3D launcher name from an include_all header.""" + text = include_all_path.read_text() + pattern = rf"(grouped_conv_{variant_prefix}_\w+_3d_\w+)\.hpp" + match = re.search(pattern, text) + if match: + return match.group(1) + "_Launcher" + return "" + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--kernel-dir", required=True) + parser.add_argument("--output", required=True) + args = parser.parse_args() + + kdir = Path(args.kernel_dir) + + fwd_3d = find_3d_launcher(kdir / "include_all_grouped_conv_fwd_kernels.hpp", "fwd") + bwd_data_3d = find_3d_launcher( + kdir / "include_all_grouped_conv_bwd_data_kernels.hpp", "bwd_data" + ) + bwd_weight_3d = find_3d_launcher( + kdir / "include_all_grouped_conv_bwd_weight_kernels.hpp", "bwd_weight" + ) + + lines = [ + "// Auto-generated dispatch header for Python conv library", + "#pragma once", + "", + "// Forward kernels", + '#include "include_all_grouped_conv_fwd_kernels.hpp"', + "#define CONV_FWD_2D_AVAILABLE 1", + ] + if fwd_3d: + lines += [ + "#define CONV_FWD_3D_AVAILABLE 1", + f"using ConvFwd3dLauncher = {fwd_3d};", + ] + lines += [ + "", + "// Backward data kernels", + '#include "include_all_grouped_conv_bwd_data_kernels.hpp"', + "#define CONV_BWD_DATA_2D_AVAILABLE 1", + ] + if bwd_data_3d: + lines += [ + "#define CONV_BWD_DATA_3D_AVAILABLE 1", + f"using ConvBwdData3dLauncher = {bwd_data_3d};", + ] + lines += [ + "", + "// Backward weight kernels", + '#include "include_all_grouped_conv_bwd_weight_kernels.hpp"', + "#define CONV_BWD_WEIGHT_2D_AVAILABLE 1", + ] + if bwd_weight_3d: + lines += [ + "#define CONV_BWD_WEIGHT_3D_AVAILABLE 1", + f"using ConvBwdWeight3dLauncher = {bwd_weight_3d};", + ] + + # Kernel name table for Python introspection + names = [] + if True: # fwd 2D always present + names.append('"fwd_2d"') + if fwd_3d: + names.append('"fwd_3d"') + if True: # bwd_data 2D + names.append('"bwd_data_2d"') + if bwd_data_3d: + names.append('"bwd_data_3d"') + if True: # bwd_weight 2D + names.append('"bwd_weight_2d"') + if bwd_weight_3d: + names.append('"bwd_weight_3d"') + + lines += [ + "", + "// Kernel inventory for Python", + f"static const char* CONV_KERNEL_NAMES[] = {{{', '.join(names)}}};", + f"static const int CONV_KERNEL_COUNT = {len(names)};", + "", + ] + + Path(args.output).write_text("\n".join(lines) + "\n") + print(f"Generated dispatch header: {args.output} ({len(names)} kernels)") + + +if __name__ == "__main__": + main() diff --git a/dispatcher/scripts/parallel_kernel_builder.py b/dispatcher/scripts/parallel_kernel_builder.py index 911ea61bd7..aef8f4ff0b 100755 --- a/dispatcher/scripts/parallel_kernel_builder.py +++ b/dispatcher/scripts/parallel_kernel_builder.py @@ -132,7 +132,7 @@ def main(): print(f"Linking failed: {result.stderr}") return 1 - print(f"✓ Built: {lib_path}") + print(f"OK Built: {lib_path}") return 0 diff --git a/dispatcher/scripts/stress_test_autocorrect.py b/dispatcher/scripts/stress_test_autocorrect.py index 13e92abffa..63b250071e 100644 --- a/dispatcher/scripts/stress_test_autocorrect.py +++ b/dispatcher/scripts/stress_test_autocorrect.py @@ -34,9 +34,9 @@ from compile_gemm_examples import ( # noqa: E402 validate_kernel_config, expand_declaration_with_arch_filter, ) -from compile_conv_examples import ( # noqa: E402 - validate_conv_kernel_config, - expand_conv_declaration_with_arch_filter, +from compile_grouped_conv_examples import ( # 
noqa: E402 + validate_grouped_conv_kernel_config as validate_conv_kernel_config, + expand_grouped_conv_declaration_with_arch_filter as expand_conv_declaration_with_arch_filter, ) @@ -316,7 +316,7 @@ def test_python_autocorrect(verbose=False): if was_modified: print(f" Modified: {len(corrections)} correction(s)") for c in corrections: - print(f" • {c}") + print(f" - {c}") except Exception as e: results["failed"] += 1 @@ -465,7 +465,7 @@ def run_stress_test(arch, num_samples, verbose): } expanded = expand_declaration_with_arch_filter(config, test_arch) - status = "✓" if expanded else "✗" + status = "OK" if expanded else "FAIL" expected = test_arch in test["expected_archs"] match = "OK" if (bool(expanded) == expected) else "MISMATCH" diff --git a/dispatcher/src/dispatcher.cpp b/dispatcher/src/dispatcher.cpp index fdb400921e..2cb589adf2 100644 --- a/dispatcher/src/dispatcher.cpp +++ b/dispatcher/src/dispatcher.cpp @@ -2,17 +2,18 @@ // SPDX-License-Identifier: MIT #include "ck_tile/dispatcher/dispatcher.hpp" -#include +#include "ck_tile/dispatcher/dispatcher_error.hpp" #include #include namespace ck_tile { namespace dispatcher { -Dispatcher::Dispatcher(Registry* registry) +Dispatcher::Dispatcher(Registry* registry, const std::string& gfx_arch) : registry_(registry ? registry : &Registry::instance()), heuristic_(nullptr), - strategy_(SelectionStrategy::FirstFit) + strategy_(SelectionStrategy::FirstFit), + gfx_arch_(gfx_arch) { } @@ -61,7 +62,7 @@ float Dispatcher::run_fused(const void* a_ptr, std::ostringstream oss; oss << "No suitable kernel found for problem: M=" << problem.M << " N=" << problem.N << " K=" << problem.K; - throw std::runtime_error(oss.str()); + throw NoKernelFound(oss.str()); } return kernel->run(a_ptr, b_ptr, c_ptr, d_ptrs, problem, stream); @@ -78,7 +79,7 @@ float Dispatcher::run_explicit(const std::string& kernel_id, auto kernel = registry_->lookup(kernel_id); if(!kernel) { - throw std::runtime_error("Kernel not found: " + kernel_id); + throw NoKernelFound("Kernel not found: " + kernel_id); } if(!kernel->supports(problem)) @@ -86,7 +87,7 @@ float Dispatcher::run_explicit(const std::string& kernel_id, std::ostringstream oss; oss << "Kernel " << kernel_id << " does not support problem: M=" << problem.M << " N=" << problem.N << " K=" << problem.K; - throw std::runtime_error(oss.str()); + throw UnsupportedProblem(oss.str()); } return kernel->run(a_ptr, b_ptr, c_ptr, d_ptrs, problem, stream); diff --git a/dispatcher/src/registry.cpp b/dispatcher/src/registry.cpp index 0d83afd613..f565885181 100644 --- a/dispatcher/src/registry.cpp +++ b/dispatcher/src/registry.cpp @@ -5,39 +5,32 @@ #include "ck_tile/dispatcher/json_export.hpp" #include "ck_tile/dispatcher/arch_filter.hpp" #include +#include +#include namespace ck_tile { namespace dispatcher { -Registry::Registry() - : name_("default"), - auto_export_enabled_(false), - auto_export_include_statistics_(true), - auto_export_on_every_registration_(true) -{ -} +Registry::Registry() = default; Registry::~Registry() { - // Perform auto-export on destruction if enabled (regardless of export_on_every_registration - // setting) if(auto_export_enabled_) { perform_auto_export(); } } -Registry::Registry(Registry&& other) noexcept - : mutex_() // mutex is not movable, create new one - , - kernels_(std::move(other.kernels_)), - name_(std::move(other.name_)), - auto_export_enabled_(other.auto_export_enabled_), - auto_export_filename_(std::move(other.auto_export_filename_)), - auto_export_include_statistics_(other.auto_export_include_statistics_), 
- auto_export_on_every_registration_(other.auto_export_on_every_registration_) +Registry::Registry(Registry&& other) noexcept : Base(std::move(other)) { - // Disable auto-export on the moved-from object to prevent double export + // Base move constructor already locked+released other.mutex_. + // Re-acquire to safely read the remaining fields. + std::lock_guard lock(other.mutex()); + auto_export_enabled_ = other.auto_export_enabled_; + auto_export_filename_ = std::move(other.auto_export_filename_); + auto_export_include_statistics_ = other.auto_export_include_statistics_; + auto_export_on_every_registration_ = other.auto_export_on_every_registration_; + other.auto_export_enabled_ = false; } @@ -45,11 +38,7 @@ Registry& Registry::operator=(Registry&& other) noexcept { if(this != &other) { - std::lock_guard lock(mutex_); - std::lock_guard other_lock(other.mutex_); - - kernels_ = std::move(other.kernels_); - name_ = std::move(other.name_); + Base::operator=(std::move(other)); auto_export_enabled_ = other.auto_export_enabled_; auto_export_filename_ = std::move(other.auto_export_filename_); auto_export_include_statistics_ = other.auto_export_include_statistics_; @@ -64,55 +53,27 @@ Registry& Registry::operator=(Registry&& other) noexcept bool Registry::register_kernel(KernelInstancePtr instance, Priority priority) { if(!instance) - { return false; - } - const std::string identifier = instance->get_key().encode_identifier(); - - bool registered = false; + if(Base::register_kernel(instance->get_name(), instance, priority)) { - std::lock_guard lock(mutex_); - - auto it = kernels_.find(identifier); - if(it != kernels_.end()) + if(auto_export_enabled_ && auto_export_on_every_registration_) { - // Kernel with this identifier already exists - // Only replace if new priority is higher - if(priority > it->second.priority) - { - it->second.instance = instance; - it->second.priority = priority; - registered = true; - } - } - else - { - // New kernel, insert it - kernels_[identifier] = RegistryEntry{instance, priority}; - registered = true; + perform_auto_export(); } + return true; } - - // Perform auto-export if enabled and configured to export on every registration - if(registered && auto_export_enabled_ && auto_export_on_every_registration_) - { - perform_auto_export(); - } - - return registered; + return false; } KernelInstancePtr Registry::lookup(const std::string& identifier) const { - std::lock_guard lock(mutex_); - - auto it = kernels_.find(identifier); - if(it != kernels_.end()) + std::lock_guard lock(mutex()); + auto it = entries().find(identifier); + if(it != entries().end()) { return it->second.instance; } - return nullptr; } @@ -121,75 +82,23 @@ KernelInstancePtr Registry::lookup(const KernelKey& key) const return lookup(key.encode_identifier()); } -std::vector Registry::get_all() const -{ - std::lock_guard lock(mutex_); - - std::vector result; - result.reserve(kernels_.size()); - - for(const auto& pair : kernels_) - { - result.push_back(pair.second.instance); - } - - return result; -} +std::vector Registry::get_all() const { return Base::get_all_instances(); } std::vector Registry::filter(std::function predicate) const { - std::lock_guard lock(mutex_); - + std::lock_guard lock(mutex()); std::vector result; - - for(const auto& pair : kernels_) + for(const auto& [name, entry] : entries()) { - if(predicate(*pair.second.instance)) + if(predicate(*(entry.instance))) { - result.push_back(pair.second.instance); + result.push_back(entry.instance); } } - return result; } -std::size_t Registry::size() 
const -{ - std::lock_guard lock(mutex_); - return kernels_.size(); -} - -bool Registry::empty() const -{ - std::lock_guard lock(mutex_); - return kernels_.empty(); -} - -void Registry::clear() -{ - std::lock_guard lock(mutex_); - kernels_.clear(); -} - -const std::string& Registry::get_name() const -{ - std::lock_guard lock(mutex_); - return name_; -} - -void Registry::set_name(const std::string& name) -{ - std::lock_guard lock(mutex_); - name_ = name; -} - -Registry& Registry::instance() -{ - static Registry global_registry; - return global_registry; -} - std::string Registry::export_json(bool include_statistics) const { return export_registry_json(*this, include_statistics); @@ -204,7 +113,7 @@ void Registry::enable_auto_export(const std::string& filename, bool include_statistics, bool export_on_every_registration) { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex()); auto_export_enabled_ = true; auto_export_filename_ = filename; auto_export_include_statistics_ = include_statistics; @@ -213,13 +122,13 @@ void Registry::enable_auto_export(const std::string& filename, void Registry::disable_auto_export() { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex()); auto_export_enabled_ = false; } bool Registry::is_auto_export_enabled() const { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex()); return auto_export_enabled_; } @@ -230,7 +139,7 @@ void Registry::perform_auto_export() bool include_stats; { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex()); if(!auto_export_enabled_) { return; @@ -243,31 +152,15 @@ void Registry::perform_auto_export() export_json_to_file(filename, include_stats); } -std::size_t Registry::merge_from(const Registry& other, Priority priority) -{ - auto other_kernels = other.get_all(); - std::size_t merged_count = 0; - - for(const auto& kernel : other_kernels) - { - if(register_kernel(kernel, priority)) - { - merged_count++; - } - } - - return merged_count; -} - std::size_t Registry::filter_by_arch(const std::string& gpu_arch) { ArchFilter filter(gpu_arch); std::vector to_remove; { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex()); - for(const auto& pair : kernels_) + for(const auto& pair : entries()) { if(!filter.is_valid(pair.second.instance->get_key())) { @@ -277,12 +170,18 @@ std::size_t Registry::filter_by_arch(const std::string& gpu_arch) for(const auto& key : to_remove) { - kernels_.erase(key); + entries_mut().erase(key); } } return to_remove.size(); } +Registry& Registry::instance() +{ + static Registry global_registry; + return global_registry; +} + } // namespace dispatcher -} // namespace ck_tile +} // namespace ck_tile \ No newline at end of file diff --git a/dispatcher/tests/CMakeLists.txt b/dispatcher/tests/CMakeLists.txt index 6c20c18c95..a54feba284 100644 --- a/dispatcher/tests/CMakeLists.txt +++ b/dispatcher/tests/CMakeLists.txt @@ -217,6 +217,10 @@ endforeach() # Standalone integration tests (with their own main()) set(STANDALONE_TESTS test_minimal.cpp + test_grouped_conv_config.cpp + test_grouped_conv_problem.cpp + test_grouped_conv_kernel_decl.cpp + test_grouped_conv_registry.cpp ) foreach(test_source ${STANDALONE_TESTS}) diff --git a/dispatcher/tests/test_autocorrect.py b/dispatcher/tests/test_autocorrect.py index 0ec3ebda3c..3f52049f74 100644 --- a/dispatcher/tests/test_autocorrect.py +++ b/dispatcher/tests/test_autocorrect.py @@ -42,10 +42,10 @@ from compile_gemm_examples import ( # noqa: E402 expand_declaration_with_arch_filter, is_wildcard_declaration, ) -from compile_conv_examples 
import ( # noqa: E402 - validate_conv_kernel_config, - expand_conv_declaration_with_arch_filter, - is_conv_wildcard_declaration, +from compile_grouped_conv_examples import ( # noqa: E402 + validate_grouped_conv_kernel_config as validate_conv_kernel_config, + expand_grouped_conv_declaration_with_arch_filter as expand_conv_declaration_with_arch_filter, + is_grouped_conv_wildcard_declaration as is_conv_wildcard_declaration, ) from ctypes_utils import auto_correct_kernel_config, KernelConfig # noqa: E402 diff --git a/dispatcher/tests/test_codegen_common.py b/dispatcher/tests/test_codegen_common.py new file mode 100644 index 0000000000..2efeaefb4d --- /dev/null +++ b/dispatcher/tests/test_codegen_common.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for codegen/codegen_common.py -- shared infrastructure for GEMM and grouped conv codegen. + +Phase 1a TDD: these tests are written BEFORE the implementation exists. +Run: python3 -m pytest tests/test_codegen_common.py -v +""" + +import sys +import unittest +from pathlib import Path + +SCRIPT_DIR = Path(__file__).parent.resolve() +DISPATCHER_DIR = SCRIPT_DIR.parent +sys.path.insert(0, str(DISPATCHER_DIR / "codegen")) + +from codegen_common import ( # noqa: E402 + TileConfig, + TraitConfigBase, + CommonTypeMappings, + generate_cpp_compilation_unit, + parallel_generate, + valid_wave_configs, + valid_warp_configs, + valid_trait_configs, + needs_wave_expansion, + needs_warp_expansion, + needs_pipeline_expansion, +) + + +class TestTileConfig(unittest.TestCase): + """TileConfig dataclass tests.""" + + def test_valid_config(self): + tc = TileConfig(128, 128, 32, 2, 2, 1, 32, 32, 16) + self.assertTrue(tc.is_valid()) + + def test_zero_tile_invalid(self): + tc = TileConfig(0, 128, 32, 2, 2, 1, 32, 32, 16) + self.assertFalse(tc.is_valid()) + + def test_non_divisible_invalid(self): + tc = TileConfig(127, 128, 32, 2, 2, 1, 32, 32, 16) + self.assertFalse(tc.is_valid()) + + def test_all_fields_accessible(self): + tc = TileConfig(256, 128, 64, 4, 1, 1, 32, 32, 16) + self.assertEqual(tc.tile_m, 256) + self.assertEqual(tc.tile_n, 128) + self.assertEqual(tc.tile_k, 64) + self.assertEqual(tc.warp_m, 4) + self.assertEqual(tc.warp_n, 1) + self.assertEqual(tc.warp_k, 1) + self.assertEqual(tc.warp_tile_m, 32) + self.assertEqual(tc.warp_tile_n, 32) + self.assertEqual(tc.warp_tile_k, 16) + + def test_small_valid_config(self): + tc = TileConfig(16, 16, 16, 1, 1, 1, 16, 16, 16) + self.assertTrue(tc.is_valid()) + + +class TestTraitConfigBase(unittest.TestCase): + """TraitConfigBase dataclass tests.""" + + def test_valid_intrawave(self): + tc = TraitConfigBase("compv3", "cshuffle", "intrawave", False, False, False) + self.assertTrue(tc.is_valid()) + + def test_invalid_interwave_compv3(self): + tc = TraitConfigBase("compv3", "cshuffle", "interwave", False, False, False) + self.assertFalse(tc.is_valid()) + + def test_invalid_interwave_compv4(self): + tc = TraitConfigBase("compv4", "cshuffle", "interwave", False, False, False) + self.assertFalse(tc.is_valid()) + + def test_valid_mem_interwave(self): + tc = TraitConfigBase("mem", "cshuffle", "interwave", False, False, False) + self.assertTrue(tc.is_valid()) + + def test_valid_mem_intrawave(self): + tc = TraitConfigBase("mem", "cshuffle", "intrawave", False, False, False) + self.assertTrue(tc.is_valid()) + + def test_padding_fields(self): + tc = TraitConfigBase("compv3", "cshuffle", "intrawave", True, True, True) + 
self.assertTrue(tc.pad_m) + self.assertTrue(tc.pad_n) + self.assertTrue(tc.pad_k) + + +class TestCommonTypeMappings(unittest.TestCase): + """CommonTypeMappings tests.""" + + def test_dtype_to_ck(self): + self.assertEqual(CommonTypeMappings.DTYPE_TO_CK["fp16"], "fp16_t") + self.assertEqual(CommonTypeMappings.DTYPE_TO_CK["bf16"], "bf16_t") + self.assertEqual(CommonTypeMappings.DTYPE_TO_CK["fp32"], "float") + self.assertEqual(CommonTypeMappings.DTYPE_TO_CK["fp8"], "fp8_t") + + def test_pipeline_to_ck(self): + self.assertEqual( + CommonTypeMappings.PIPELINE_TO_CK["mem"], "GemmPipelineAgBgCrMem" + ) + self.assertIn("compv3", CommonTypeMappings.PIPELINE_TO_CK) + self.assertIn("compv4", CommonTypeMappings.PIPELINE_TO_CK) + + def test_pipeline_to_base(self): + self.assertIn("mem", CommonTypeMappings.PIPELINE_TO_BASE) + self.assertIn("compv3", CommonTypeMappings.PIPELINE_TO_BASE) + self.assertIn("compv4", CommonTypeMappings.PIPELINE_TO_BASE) + + def test_scheduler_to_ck(self): + self.assertIn("intrawave", CommonTypeMappings.SCHEDULER_TO_CK) + self.assertIn("interwave", CommonTypeMappings.SCHEDULER_TO_CK) + + def test_epilogue_to_dispatcher(self): + self.assertIn("cshuffle", CommonTypeMappings.EPILOGUE_TO_DISPATCHER) + self.assertIn("default", CommonTypeMappings.EPILOGUE_TO_DISPATCHER) + + def test_layout_to_ck(self): + self.assertIn("r", CommonTypeMappings.LAYOUT_TO_CK) + self.assertIn("c", CommonTypeMappings.LAYOUT_TO_CK) + + def test_get_output_dtype(self): + self.assertEqual(CommonTypeMappings.get_output_dtype("fp8"), "fp16") + self.assertEqual(CommonTypeMappings.get_output_dtype("bf8"), "fp16") + self.assertEqual(CommonTypeMappings.get_output_dtype("fp16"), "fp16") + self.assertEqual(CommonTypeMappings.get_output_dtype("fp32"), "fp32") + + +class TestGenerateCppCompilationUnit(unittest.TestCase): + """Tests for generate_cpp_compilation_unit.""" + + def test_includes_kernel_header(self): + result = generate_cpp_compilation_unit("my_kernel") + self.assertIn('#include "my_kernel.hpp"', result) + + def test_contains_pragma_once_or_guard(self): + result = generate_cpp_compilation_unit("test_kernel") + self.assertIn("test_kernel", result) + + def test_different_names_different_output(self): + a = generate_cpp_compilation_unit("kernel_a") + b = generate_cpp_compilation_unit("kernel_b") + self.assertNotEqual(a, b) + + +class TestParallelGenerate(unittest.TestCase): + """Tests for parallel_generate helper.""" + + def _dummy_generate(self, item): + return f"generated_{item}" + + def test_parallel_returns_all(self): + items = ["a", "b", "c", "d"] + results = parallel_generate(self._dummy_generate, items, parallel=True) + self.assertEqual(len(results), 4) + for item in items: + self.assertIn(f"generated_{item}", results) + + def test_sequential_returns_all(self): + items = ["x", "y", "z"] + results = parallel_generate(self._dummy_generate, items, parallel=False) + self.assertEqual(len(results), 3) + for item in items: + self.assertIn(f"generated_{item}", results) + + def test_empty_items(self): + results = parallel_generate(self._dummy_generate, [], parallel=True) + self.assertEqual(len(results), 0) + + def test_logs_per_kernel_progress(self): + items = ["k1", "k2"] + with self.assertLogs(level="INFO") as cm: + parallel_generate(self._dummy_generate, items, parallel=False) + log_output = "\n".join(cm.output) + self.assertIn("k1", log_output) + self.assertIn("k2", log_output) + + +class TestArchAwareExpansion(unittest.TestCase): + """Tests for arch-aware expansion helpers (best-of-conv).""" + + def 
test_valid_wave_configs_gfx942(self): + configs = valid_wave_configs("gfx942") + self.assertIsInstance(configs, list) + self.assertIn([2, 2, 1], configs) + self.assertIn([1, 4, 1], configs) + + def test_valid_wave_configs_unknown_arch(self): + configs = valid_wave_configs("gfx_unknown") + self.assertIsInstance(configs, list) + self.assertGreater(len(configs), 0) + + def test_valid_warp_configs_gfx942_fp16(self): + configs = valid_warp_configs("gfx942", "fp16") + self.assertIsInstance(configs, list) + self.assertIn([32, 32, 16], configs) + + def test_valid_warp_configs_unknown_arch(self): + configs = valid_warp_configs("gfx_unknown", "fp16") + self.assertIsInstance(configs, list) + self.assertGreater(len(configs), 0) + + def test_valid_trait_configs_excludes_interwave_compute(self): + configs = valid_trait_configs() + self.assertIsInstance(configs, list) + self.assertNotIn(("compv3", "cshuffle", "interwave"), configs) + self.assertNotIn(("compv4", "cshuffle", "interwave"), configs) + + def test_valid_trait_configs_includes_mem_interwave(self): + configs = valid_trait_configs() + has_mem_interwave = any(p == "mem" and s == "interwave" for p, s in configs) + self.assertTrue(has_mem_interwave) + + def test_needs_wave_expansion_wildcard(self): + self.assertTrue(needs_wave_expansion({"wave_m": -1, "wave_n": 2})) + self.assertTrue(needs_wave_expansion({"wave_m": 2, "wave_n": -1})) + + def test_needs_wave_expansion_explicit(self): + self.assertFalse(needs_wave_expansion({"wave_m": 2, "wave_n": 2})) + + def test_needs_warp_expansion_wildcard(self): + self.assertTrue(needs_warp_expansion({"warp_m": -1, "warp_n": 32})) + + def test_needs_warp_expansion_explicit(self): + self.assertFalse(needs_warp_expansion({"warp_m": 32, "warp_n": 32})) + + def test_needs_pipeline_expansion_wildcard(self): + self.assertTrue(needs_pipeline_expansion({"pipeline": "*"})) + + def test_needs_pipeline_expansion_explicit(self): + self.assertFalse(needs_pipeline_expansion({"pipeline": "compv4"})) + + +if __name__ == "__main__": + unittest.main() diff --git a/dispatcher/tests/test_dispatcher_common.py b/dispatcher/tests/test_dispatcher_common.py new file mode 100644 index 0000000000..2c0fc8307c --- /dev/null +++ b/dispatcher/tests/test_dispatcher_common.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for python/dispatcher_common.py -- shared Python dispatcher utilities. + +Phase 1b TDD: tests written BEFORE implementation exists. 
+Run: python3 -m pytest tests/test_dispatcher_common.py -v +""" + +import io +import sys +import unittest +from pathlib import Path +from unittest.mock import patch + +SCRIPT_DIR = Path(__file__).parent.resolve() +DISPATCHER_DIR = SCRIPT_DIR.parent +sys.path.insert(0, str(DISPATCHER_DIR / "python")) +sys.path.insert(0, str(DISPATCHER_DIR / "codegen")) + +from dispatcher_common import ( # noqa: E402 + get_dispatcher_root, + get_ck_root, + get_build_dir, + get_generated_kernels_dir, + get_arch_filter_data, + ValidationResultBase, + validate_wave_config, + validate_warp_tile_config, + validate_trait_combo, + auto_correct_wave, + auto_correct_trait, + Colors, + print_phase, + print_success, + print_error, + print_info, + cleanup_generated_kernels, +) + + +class TestPathHelpers(unittest.TestCase): + """Tests for path helper functions.""" + + def test_dispatcher_root_contains_codegen(self): + root = get_dispatcher_root() + self.assertTrue((root / "codegen").exists()) + + def test_ck_root_contains_include_or_is_parent(self): + root = get_ck_root() + self.assertTrue(root.exists()) + self.assertEqual(root, get_dispatcher_root().parent) + + def test_build_dir_is_under_dispatcher(self): + build = get_build_dir() + self.assertEqual(build.parent, get_dispatcher_root()) + + def test_generated_kernels_dir_under_build(self): + gen_dir = get_generated_kernels_dir() + self.assertEqual(gen_dir.parent, get_build_dir()) + + +class TestGetArchFilterData(unittest.TestCase): + """Tests for get_arch_filter_data.""" + + def test_returns_dict(self): + data = get_arch_filter_data() + self.assertIsInstance(data, dict) + + def test_has_warp_combos(self): + data = get_arch_filter_data() + self.assertIn("warp_combos", data) + + def test_has_warp_tile_combos(self): + data = get_arch_filter_data() + self.assertIn("warp_tile_combos", data) + + def test_has_trait_unsupported(self): + data = get_arch_filter_data() + self.assertIn("trait_unsupported", data) + + def test_has_supported_archs(self): + data = get_arch_filter_data() + self.assertIn("supported_archs", data) + self.assertIn("gfx942", data["supported_archs"]) + + def test_gfx942_wave_configs(self): + data = get_arch_filter_data() + gfx942 = data["warp_combos"].get("gfx942", []) + self.assertIn([2, 2, 1], gfx942) + + +class TestValidationResultBase(unittest.TestCase): + """Tests for ValidationResultBase dataclass.""" + + def test_valid_result(self): + vr = ValidationResultBase(is_valid=True) + self.assertTrue(vr.is_valid) + self.assertEqual(vr.errors, []) + self.assertEqual(vr.warnings, []) + self.assertEqual(vr.suggested_fixes, {}) + + def test_invalid_result(self): + vr = ValidationResultBase( + is_valid=False, + errors=["bad wave"], + suggested_fixes={"wave_m": 2}, + ) + self.assertFalse(vr.is_valid) + self.assertEqual(len(vr.errors), 1) + self.assertIn("wave_m", vr.suggested_fixes) + + +class TestValidateWaveConfig(unittest.TestCase): + """Tests for validate_wave_config.""" + + def test_valid_wave(self): + is_valid, msg = validate_wave_config([2, 2, 1], "gfx942") + self.assertTrue(is_valid) + self.assertEqual(msg, "") + + def test_invalid_wave(self): + is_valid, msg = validate_wave_config([3, 3, 1], "gfx942") + self.assertFalse(is_valid) + self.assertIn("wave", msg.lower()) + + +class TestValidateWarpTileConfig(unittest.TestCase): + """Tests for validate_warp_tile_config.""" + + def test_valid_warp_tile(self): + is_valid, msg = validate_warp_tile_config([32, 32, 16], "gfx942", "fp16") + self.assertTrue(is_valid) + + def test_invalid_warp_tile(self): + is_valid, msg 
= validate_warp_tile_config([99, 99, 99], "gfx942", "fp16") + self.assertFalse(is_valid) + self.assertIn("warp", msg.lower()) + + +class TestValidateTraitCombo(unittest.TestCase): + """Tests for validate_trait_combo.""" + + def test_valid_trait(self): + is_valid, msg = validate_trait_combo("compv3", "cshuffle", "intrawave") + self.assertTrue(is_valid) + + def test_invalid_trait_interwave_compute(self): + is_valid, msg = validate_trait_combo("compv4", "cshuffle", "interwave") + self.assertFalse(is_valid) + + def test_valid_mem_interwave(self): + is_valid, msg = validate_trait_combo("mem", "cshuffle", "interwave") + self.assertTrue(is_valid) + + +class TestAutoCorrectWave(unittest.TestCase): + """Tests for auto_correct_wave.""" + + def test_corrects_invalid_wave(self): + corrected = auto_correct_wave([1, 1, 1], "gfx942") + self.assertIsInstance(corrected, list) + self.assertEqual(len(corrected), 3) + data = get_arch_filter_data() + valid_waves = data["warp_combos"].get("gfx942", [[2, 2, 1]]) + self.assertIn(corrected, valid_waves) + + +class TestAutoCorrectTrait(unittest.TestCase): + """Tests for auto_correct_trait.""" + + def test_corrects_invalid_scheduler(self): + corrected_pipeline, corrected_scheduler = auto_correct_trait( + "compv4", "interwave" + ) + self.assertEqual(corrected_scheduler, "intrawave") + + +class TestColors(unittest.TestCase): + """Tests for Colors class (cross-platform ANSI support from conv).""" + + def test_green_returns_string(self): + result = Colors.green("ok") + self.assertIn("ok", result) + + def test_red_returns_string(self): + result = Colors.red("error") + self.assertIn("error", result) + + def test_yellow_returns_string(self): + result = Colors.yellow("warn") + self.assertIn("warn", result) + + def test_bold_returns_string(self): + result = Colors.bold("title") + self.assertIn("title", result) + + def test_plain_mode_no_ansi(self): + with patch.object(Colors, "_use_color", return_value=False): + result = Colors.green("plain") + self.assertEqual(result, "plain") + + +class TestPhasedOutput(unittest.TestCase): + """Tests for phased output helpers.""" + + def test_print_phase(self): + buf = io.StringIO() + with patch("sys.stdout", buf): + print_phase(1, "Setup") + self.assertIn("Setup", buf.getvalue()) + + def test_print_success(self): + buf = io.StringIO() + with patch("sys.stdout", buf): + print_success("Done") + self.assertIn("Done", buf.getvalue()) + + def test_print_error(self): + buf = io.StringIO() + with patch("sys.stdout", buf): + print_error("Oops") + self.assertIn("Oops", buf.getvalue()) + + def test_print_info(self): + buf = io.StringIO() + with patch("sys.stdout", buf): + print_info("FYI") + self.assertIn("FYI", buf.getvalue()) + + +class TestCleanup(unittest.TestCase): + """Tests for cleanup_generated_kernels.""" + + def test_cleanup_nonexistent_dir_no_error(self): + cleanup_generated_kernels(Path("/tmp/nonexistent_ck_test_dir_12345")) + + +if __name__ == "__main__": + unittest.main() diff --git a/dispatcher/tests/test_examples_integration.py b/dispatcher/tests/test_examples_integration.py index cfd18a3305..d02ea69787 100644 --- a/dispatcher/tests/test_examples_integration.py +++ b/dispatcher/tests/test_examples_integration.py @@ -28,14 +28,18 @@ sys.path.insert(0, str(PYTHON_DIR)) def run_python_example( - example_path: Path, timeout: int = 120 + example_path: Path, timeout: int = 120, extra_args: list = None ) -> subprocess.CompletedProcess: """Run a Python example and capture output.""" env = os.environ.copy() env["PYTHONPATH"] = 
str(PYTHON_DIR) + cmd = [sys.executable, str(example_path)] + if extra_args: + cmd.extend(extra_args) + return subprocess.run( - [sys.executable, str(example_path)], + cmd, capture_output=True, text=True, timeout=timeout, @@ -111,61 +115,74 @@ class TestGemmPythonExamples(unittest.TestCase): result = run_python_example(example) self.assertEqual(result.returncode, 0, f"Example failed:\n{result.stderr}") - # Should pass validation self.assertIn("PASS", result.stdout.upper(), "Validation should pass") class TestConvPythonExamples(unittest.TestCase): - """Test Conv Python examples.""" + """Test grouped conv Python examples.""" @classmethod def setUpClass(cls): """Check if examples directory exists.""" - cls.conv_examples_dir = EXAMPLES_DIR / "conv" / "python" + cls.conv_examples_dir = EXAMPLES_DIR / "grouped_conv" / "python" if not cls.conv_examples_dir.exists(): - raise unittest.SkipTest("Conv Python examples not found") + raise unittest.SkipTest("Grouped conv Python examples not found") - def test_01_basic_conv(self): - """Test basic conv example.""" - example = self.conv_examples_dir / "01_basic_conv.py" + def test_01_basic_grouped_conv(self): + """Test basic grouped conv example.""" + example = self.conv_examples_dir / "01_basic_grouped_conv.py" if not example.exists(): self.skipTest(f"{example.name} not found") - result = run_python_example(example) - self.assertEqual(result.returncode, 0, f"Example failed:\n{result.stderr}") - self.assertIn("TFLOPS", result.stdout, "Should report TFLOPS") + self.assertIn("PASS", result.stdout.upper()) - def test_02_conv2d_fwd(self): - """Test 2D forward conv example.""" - example = self.conv_examples_dir / "02_conv2d_fwd.py" + def test_02_forward(self): + """Test forward conv example (2D + 3D).""" + example = self.conv_examples_dir / "02_forward.py" if not example.exists(): self.skipTest(f"{example.name} not found") - result = run_python_example(example) - self.assertEqual(result.returncode, 0, f"Example failed:\n{result.stderr}") + self.assertIn("PASS", result.stdout.upper()) - def test_03_conv3d_fwd(self): - """Test 3D forward conv example.""" - example = self.conv_examples_dir / "03_conv3d_fwd.py" + def test_03_bwd_data(self): + """Test backward data example.""" + example = self.conv_examples_dir / "03_bwd_data.py" if not example.exists(): self.skipTest(f"{example.name} not found") - result = run_python_example(example) - self.assertEqual(result.returncode, 0, f"Example failed:\n{result.stderr}") + self.assertIn("PASS", result.stdout.upper()) - def test_07_validation(self): - """Test validation example.""" - example = self.conv_examples_dir / "07_validation.py" + def test_04_bwd_weight(self): + """Test backward weight example.""" + example = self.conv_examples_dir / "04_bwd_weight.py" if not example.exists(): self.skipTest(f"{example.name} not found") - result = run_python_example(example) - self.assertEqual(result.returncode, 0, f"Example failed:\n{result.stderr}") - self.assertIn("PASS", result.stdout.upper(), "Validation should pass") + self.assertIn("PASS", result.stdout.upper()) + + def test_05_benchmark(self): + """Test benchmark example.""" + example = self.conv_examples_dir / "05_benchmark.py" + if not example.exists(): + self.skipTest(f"{example.name} not found") + result = run_python_example( + example, extra_args=["--warmup", "1", "--repeat", "1"] + ) + self.assertEqual(result.returncode, 0, f"Example failed:\n{result.stderr}") + self.assertIn("PASS", result.stdout.upper()) + + def test_06_registry_json(self): + """Test registry + heuristic 
+ JSON example.""" + example = self.conv_examples_dir / "06_registry_json.py" + if not example.exists(): + self.skipTest(f"{example.name} not found") + result = run_python_example(example) + self.assertEqual(result.returncode, 0, f"Example failed:\n{result.stderr}") + self.assertIn("PASS", result.stdout.upper()) class TestGemmCppExamples(unittest.TestCase): @@ -195,18 +212,18 @@ class TestGemmCppExamples(unittest.TestCase): self.assertEqual(result.returncode, 0, f"Example failed:\n{result.stderr}") - def test_gemm_04_validation(self): - """Test validation GEMM C++ example.""" - result = run_cpp_example("gemm_04_validation") + def test_gemm_03_benchmark_validation(self): + """Test benchmark+validation GEMM C++ example.""" + result = run_cpp_example("gemm_03_benchmark_validation") if result is None: - self.skipTest("gemm_04_validation not built") + self.skipTest("gemm_03_benchmark_validation not built") self.assertEqual(result.returncode, 0, f"Example failed:\n{result.stderr}") self.assertIn("PASS", result.stdout.upper(), "Validation should pass") class TestConvCppExamples(unittest.TestCase): - """Test Conv C++ examples.""" + """Test grouped conv C++ examples.""" @classmethod def setUpClass(cls): @@ -215,23 +232,29 @@ class TestConvCppExamples(unittest.TestCase): if not cls.examples_dir.exists(): raise unittest.SkipTest("C++ examples not built") - def test_conv_01_forward(self): - """Test forward conv C++ example.""" - result = run_cpp_example("conv_01_forward") + def test_grouped_conv_01_basic(self): + """Test basic grouped conv C++ example.""" + result = run_cpp_example("grouped_conv_01_basic") if result is None: - self.skipTest("conv_01_forward not built") - + self.skipTest("grouped_conv_01_basic not built") self.assertEqual(result.returncode, 0, f"Example failed:\n{result.stderr}") - self.assertIn("TFLOPS", result.stdout, "Should report TFLOPS") + self.assertIn("PASS", result.stdout.upper()) - def test_conv_02_validation(self): - """Test validation conv C++ example.""" - result = run_cpp_example("conv_02_validation") + def test_grouped_conv_02_all_dirs(self): + """Test all directions grouped conv C++ example.""" + result = run_cpp_example("grouped_conv_02_all_dirs") if result is None: - self.skipTest("conv_02_validation not built") - + self.skipTest("grouped_conv_02_all_dirs not built") self.assertEqual(result.returncode, 0, f"Example failed:\n{result.stderr}") - self.assertIn("PASS", result.stdout.upper(), "Validation should pass") + self.assertIn("PASS", result.stdout.upper()) + + def test_grouped_conv_03_bench_val(self): + """Test benchmark+validation grouped conv C++ example.""" + result = run_cpp_example("grouped_conv_03_bench_val") + if result is None: + self.skipTest("grouped_conv_03_bench_val not built") + self.assertEqual(result.returncode, 0, f"Example failed:\n{result.stderr}") + self.assertIn("PASS", result.stdout.upper()) class TestUtilityImports(unittest.TestCase): @@ -246,14 +269,18 @@ class TestUtilityImports(unittest.TestCase): except ImportError as e: self.fail(f"Failed to import ctypes_utils: {e}") - def test_import_conv_utils(self): - """Test importing conv_utils.""" + def test_import_grouped_conv_utils(self): + """Test importing grouped_conv_utils.""" try: - from conv_utils import ConvSignature, ConvAlgorithm, ConvProblem # noqa: F401 + from grouped_conv_utils import ( # noqa: F401 + GroupedConvValidationResult, + validate_grouped_conv_config, + GroupedConvDataType, + ) self.assertTrue(True) except ImportError as e: - self.fail(f"Failed to import conv_utils: {e}") + 
self.fail(f"Failed to import grouped_conv_utils: {e}") def test_kernel_config_creation(self): """Test creating a KernelConfig.""" @@ -272,22 +299,19 @@ class TestUtilityImports(unittest.TestCase): self.assertEqual(config.dtype_a, "fp16") self.assertEqual(config.layout_a, "row") - def test_conv_signature_creation(self): - """Test creating a ConvSignature.""" - from conv_utils import ConvSignature + def test_grouped_conv_default_config(self): + """Test creating a grouped conv default config.""" + from grouped_conv_utils import get_grouped_conv_default_config - sig = ConvSignature( - dtype_in="fp16", - dtype_wei="fp16", - dtype_out="fp16", - dtype_acc="fp32", - layout="nhwgc", - direction="forward", - num_dims=2, + config = get_grouped_conv_default_config( + variant="forward", + ndim_spatial=2, + arch="gfx942", ) - self.assertEqual(sig.dtype_in, "fp16") - self.assertEqual(sig.direction, "forward") + d = config.to_dict() if hasattr(config, "to_dict") else config + self.assertEqual(d["variant"], "forward") + self.assertEqual(d["arch"], "gfx942") class TestAutoCorrection(unittest.TestCase): @@ -316,21 +340,22 @@ class TestAutoCorrection(unittest.TestCase): self.assertTrue(was_modified, "Config should be modified") self.assertGreater(len(corrections), 0, "Should have corrections") - def test_conv_auto_correct(self): - """Test Conv auto-correction.""" - from conv_utils import auto_correct_conv_config - - # Call with invalid wave config parameters - corrected, was_modified, corrections = auto_correct_conv_config( - wave_m=99, # Invalid - wave_n=99, # Invalid - wave_k=99, # Invalid - dtype="fp16", - arch="gfx942", + def test_grouped_conv_auto_correct(self): + """Test Grouped Conv auto-correction.""" + from grouped_conv_utils import ( + auto_correct_grouped_conv_config, + get_grouped_conv_default_config, ) - self.assertTrue(was_modified, "Config should be modified") - self.assertGreater(len(corrections), 0, "Should have corrections") + config = get_grouped_conv_default_config() + d = config.to_dict() if hasattr(config, "to_dict") else config + d["tile_config"]["warp_m"] = [99] + d["tile_config"]["warp_n"] = [99] + + corrected, result = auto_correct_grouped_conv_config(d) + + self.assertIsInstance(corrected, dict) + self.assertIn("tile_config", corrected) if __name__ == "__main__": diff --git a/dispatcher/tests/test_grouped_conv_codegen.py b/dispatcher/tests/test_grouped_conv_codegen.py new file mode 100644 index 0000000000..acfa5abd8f --- /dev/null +++ b/dispatcher/tests/test_grouped_conv_codegen.py @@ -0,0 +1,589 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +TDD tests for codegen/unified_grouped_conv_codegen.py -- grouped convolution code generator. + +These tests are written BEFORE the implementation exists. 
+Run: python3 -m pytest dispatcher/tests/test_grouped_conv_codegen.py -v +""" + +import sys +import unittest +from pathlib import Path +from unittest.mock import patch + +SCRIPT_DIR = Path(__file__).parent.resolve() +DISPATCHER_DIR = SCRIPT_DIR.parent +sys.path.insert(0, str(DISPATCHER_DIR / "codegen")) +sys.path.insert(0, str(DISPATCHER_DIR / "python")) + +from codegen_common import TileConfig, TraitConfigBase # noqa: E402 + +from unified_grouped_conv_codegen import ( # noqa: E402 + GroupedConvVariant, + GroupedConvLayout, + GroupedConvKernelConfig, + GroupedConvTypeMappings, + GroupedConvTraitConfig, + CKTileGroupedConvKernelGenerator, + GroupedConvDispatcherWrapperGenerator, + UnifiedGroupedConvCodegen, +) + + +# ============================================================================= +# TestGroupedConvVariant +# ============================================================================= + + +class TestGroupedConvVariant(unittest.TestCase): + """Test GroupedConvVariant enum values.""" + + def test_forward_value(self): + self.assertEqual(GroupedConvVariant.FORWARD.value, "forward") + + def test_backward_data_value(self): + self.assertEqual(GroupedConvVariant.BACKWARD_DATA.value, "bwd_data") + + def test_backward_weight_value(self): + self.assertEqual(GroupedConvVariant.BACKWARD_WEIGHT.value, "bwd_weight") + + def test_all_variants_exist(self): + self.assertIn(GroupedConvVariant.FORWARD, GroupedConvVariant) + self.assertIn(GroupedConvVariant.BACKWARD_DATA, GroupedConvVariant) + self.assertIn(GroupedConvVariant.BACKWARD_WEIGHT, GroupedConvVariant) + + +# ============================================================================= +# TestGroupedConvLayout +# ============================================================================= + + +class TestGroupedConvLayout(unittest.TestCase): + """Test GroupedConvLayout enum for 1D/2D/3D layouts.""" + + def test_nhwgc_value(self): + self.assertEqual(GroupedConvLayout.NHWGC.value, "NHWGC") + + def test_gkyxc_value(self): + self.assertEqual(GroupedConvLayout.GKYXC.value, "GKYXC") + + def test_nhwgk_value(self): + self.assertEqual(GroupedConvLayout.NHWGK.value, "NHWGK") + + def test_1d_layouts_exist(self): + """1D conv layouts (e.g., NWGC, GYXC, NWGK).""" + layouts_1d = [ + lay + for lay in GroupedConvLayout + if "W" in lay.value and "H" not in lay.value + ] + self.assertGreater(len(layouts_1d), 0) + + def test_2d_layouts_exist(self): + """2D conv layouts (e.g., NHWGC, GKYXC, NHWGK).""" + layouts_2d = [lay for lay in GroupedConvLayout if "HW" in lay.value] + self.assertGreater(len(layouts_2d), 0) + + def test_3d_layouts_exist(self): + """3D conv layouts (e.g., NDHWGC, GDKYXC).""" + layouts_3d = [ + lay for lay in GroupedConvLayout if "D" in lay.value or "DHW" in lay.value + ] + self.assertGreater(len(layouts_3d), 0) + + +# ============================================================================= +# TestGroupedConvKernelConfig +# ============================================================================= + + +class TestGroupedConvKernelConfig(unittest.TestCase): + """Test GroupedConvKernelConfig dataclass.""" + + def _make_tile(self): + return TileConfig(128, 128, 32, 2, 2, 1, 32, 32, 16) + + def _make_trait(self): + return GroupedConvTraitConfig( + "mem", + "cshuffle", + "intrawave", + False, + False, + False, + double_smem_buffer=False, + num_groups_to_merge=1, + ) + + def test_name_contains_grouped_conv_fwd(self): + config = GroupedConvKernelConfig( + tile=self._make_tile(), + trait=self._make_trait(), + 
variant=GroupedConvVariant.FORWARD, + ndim_spatial=2, + arch="gfx942", + layout=GroupedConvLayout.NHWGC, + vector_sizes=(4, 4, 4), + ) + name = config.name("fp16") + self.assertIn("grouped_conv_fwd", name) + + def test_name_backward_data_contains_bwd_data(self): + config = GroupedConvKernelConfig( + tile=self._make_tile(), + trait=self._make_trait(), + variant=GroupedConvVariant.BACKWARD_DATA, + ndim_spatial=2, + arch="gfx942", + layout=GroupedConvLayout.NHWGC, + vector_sizes=(4, 4, 4), + ) + name = config.name("fp16") + self.assertIn("bwd_data", name) + + def test_is_valid_for_arch_supported(self): + config = GroupedConvKernelConfig( + tile=self._make_tile(), + trait=self._make_trait(), + variant=GroupedConvVariant.FORWARD, + ndim_spatial=2, + arch="gfx942", + layout=GroupedConvLayout.NHWGC, + vector_sizes=(4, 4, 4), + ) + self.assertTrue(config.is_valid_for_arch("gfx942")) + + def test_is_valid_for_arch_unsupported(self): + config = GroupedConvKernelConfig( + tile=self._make_tile(), + trait=self._make_trait(), + variant=GroupedConvVariant.FORWARD, + ndim_spatial=2, + arch="gfx942", + layout=GroupedConvLayout.NHWGC, + vector_sizes=(4, 4, 4), + ) + self.assertFalse(config.is_valid_for_arch("gfx600")) + + +# ============================================================================= +# TestGroupedConvTypeMappings +# ============================================================================= + + +class TestGroupedConvTypeMappings(unittest.TestCase): + """Test GroupedConvTypeMappings class.""" + + def test_dtype_to_ck_fp16(self): + self.assertEqual(GroupedConvTypeMappings.DTYPE_TO_CK["fp16"], "half_t") + + def test_dtype_to_ck_bf16(self): + self.assertIn("bf16", GroupedConvTypeMappings.DTYPE_TO_CK) + + def test_dtype_to_ck_fp32(self): + self.assertIn("fp32", GroupedConvTypeMappings.DTYPE_TO_CK) + + def test_get_layouts_2d_has_in_wei_out_keys(self): + layouts = GroupedConvTypeMappings.get_layouts(2) + self.assertIn("in", layouts) + self.assertIn("wei", layouts) + self.assertIn("out", layouts) + + def test_get_layouts_2d_returns_dict(self): + layouts = GroupedConvTypeMappings.get_layouts(2) + self.assertIsInstance(layouts, dict) + + def test_get_layouts_1d(self): + layouts = GroupedConvTypeMappings.get_layouts(1) + self.assertIn("in", layouts) + self.assertIn("wei", layouts) + self.assertIn("out", layouts) + + def test_get_layouts_3d(self): + layouts = GroupedConvTypeMappings.get_layouts(3) + self.assertIn("in", layouts) + self.assertIn("wei", layouts) + self.assertIn("out", layouts) + + +# ============================================================================= +# TestCKTileGroupedConvKernelGenerator +# ============================================================================= + + +class TestCKTileGroupedConvKernelGenerator(unittest.TestCase): + """Test CKTileGroupedConvKernelGenerator.generate().""" + + def _make_config(self): + tile = TileConfig(128, 128, 32, 2, 2, 1, 32, 32, 16) + trait = GroupedConvTraitConfig( + "mem", + "cshuffle", + "intrawave", + False, + False, + False, + double_smem_buffer=False, + num_groups_to_merge=1, + ) + return GroupedConvKernelConfig( + tile=tile, + trait=trait, + variant=GroupedConvVariant.FORWARD, + ndim_spatial=2, + arch="gfx942", + layout=GroupedConvLayout.NHWGC, + vector_sizes=(4, 4, 4), + ) + + def test_generate_contains_pragma_once(self): + gen = CKTileGroupedConvKernelGenerator("fp16") + config = self._make_config() + result = gen.generate(config) + self.assertIn("#pragma once", result) + + def 
test_generate_contains_forward_kernel_include(self): + gen = CKTileGroupedConvKernelGenerator("fp16") + config = self._make_config() + result = gen.generate(config) + self.assertIn("grouped_convolution_forward_kernel.hpp", result) + + def test_generate_returns_non_empty_string(self): + gen = CKTileGroupedConvKernelGenerator("fp16") + config = self._make_config() + result = gen.generate(config) + self.assertIsInstance(result, str) + self.assertGreater(len(result), 100) + + def test_generate_valid_cpp_structure(self): + gen = CKTileGroupedConvKernelGenerator("fp16") + config = self._make_config() + result = gen.generate(config) + self.assertIn("#include", result) + self.assertIn("ck_tile", result) + + +# ============================================================================= +# TestGroupedConvDispatcherWrapperGenerator +# ============================================================================= + + +class TestGroupedConvDispatcherWrapperGenerator(unittest.TestCase): + """Test GroupedConvDispatcherWrapperGenerator.generate().""" + + def _make_config(self): + tile = TileConfig(128, 128, 32, 2, 2, 1, 32, 32, 16) + trait = GroupedConvTraitConfig( + "mem", + "cshuffle", + "intrawave", + False, + False, + False, + double_smem_buffer=False, + num_groups_to_merge=1, + ) + return GroupedConvKernelConfig( + tile=tile, + trait=trait, + variant=GroupedConvVariant.FORWARD, + ndim_spatial=2, + arch="gfx942", + layout=GroupedConvLayout.NHWGC, + vector_sizes=(4, 4, 4), + ) + + def test_generate_contains_dispatcher_registration(self): + gen = GroupedConvDispatcherWrapperGenerator("fp16") + config = self._make_config() + kernel_path = DISPATCHER_DIR / "build" / "generated" / "test_kernel.hpp" + output_dir = DISPATCHER_DIR / "build" / "generated" + result = gen.generate(config, kernel_path, output_dir) + self.assertIn("dispatcher", result) + self.assertIn("KernelKey", result) + self.assertIn("KernelInstancePtr", result) + + def test_generate_contains_pragma_once(self): + gen = GroupedConvDispatcherWrapperGenerator("fp16") + config = self._make_config() + kernel_path = DISPATCHER_DIR / "build" / "generated" / "test_kernel.hpp" + output_dir = DISPATCHER_DIR / "build" / "generated" + result = gen.generate(config, kernel_path, output_dir) + self.assertIn("#pragma once", result) + + def test_generate_valid_cpp(self): + gen = GroupedConvDispatcherWrapperGenerator("fp16") + config = self._make_config() + kernel_path = DISPATCHER_DIR / "build" / "generated" / "test_kernel.hpp" + output_dir = DISPATCHER_DIR / "build" / "generated" + result = gen.generate(config, kernel_path, output_dir) + self.assertIn("#include", result) + self.assertIn("namespace", result) + + +# ============================================================================= +# TestUnifiedGroupedConvCodegen +# ============================================================================= + + +class TestUnifiedGroupedConvCodegen(unittest.TestCase): + """Test UnifiedGroupedConvCodegen.generate_all().""" + + def test_generate_all_returns_dict_with_expected_keys(self): + output_dir = DISPATCHER_DIR / "build" / "generated" / "grouped_conv" + output_dir.mkdir(parents=True, exist_ok=True) + codegen = UnifiedGroupedConvCodegen( + output_dir=output_dir, + datatype="fp16", + ndim_spatial=2, + gpu_target="gfx942", + ) + with patch.object( + codegen, + "_get_configs", + return_value=[], # Mock empty config list for fast test + ): + results = codegen.generate_all(parallel=False) + self.assertIn("kernels", results) + self.assertIn("wrappers", results) + 
self.assertIn("failed", results) + self.assertIsInstance(results["kernels"], list) + self.assertIsInstance(results["wrappers"], list) + self.assertIsInstance(results["failed"], list) + + def test_generate_all_with_mock_config_produces_output(self): + output_dir = DISPATCHER_DIR / "build" / "generated" / "grouped_conv_test" + output_dir.mkdir(parents=True, exist_ok=True) + codegen = UnifiedGroupedConvCodegen( + output_dir=output_dir, + datatype="fp16", + ndim_spatial=2, + gpu_target="gfx942", + ) + # Use a real config - patch the config source to return one config + tile = TileConfig(128, 128, 32, 2, 2, 1, 32, 32, 16) + trait = GroupedConvTraitConfig( + "mem", + "cshuffle", + "intrawave", + False, + False, + False, + double_smem_buffer=False, + num_groups_to_merge=1, + ) + config = GroupedConvKernelConfig( + tile=tile, + trait=trait, + variant=GroupedConvVariant.FORWARD, + ndim_spatial=2, + arch="gfx942", + layout=GroupedConvLayout.NHWGC, + vector_sizes=(4, 4, 4), + ) + + with patch.object(codegen, "_get_configs", return_value=[config]): + results = codegen.generate_all(parallel=False) + self.assertIsInstance(results, dict) + self.assertIn("kernels", results) + + +# ============================================================================= +# TestSharedImports +# ============================================================================= + + +class TestSharedImports(unittest.TestCase): + """Verify TileConfig from codegen_common and GroupedConvTraitConfig extends TraitConfigBase.""" + + def test_tile_config_has_expected_fields(self): + """TileConfig from codegen_common has tile_m, tile_n, tile_k, etc.""" + tc = TileConfig(128, 128, 32, 2, 2, 1, 32, 32, 16) + self.assertEqual(tc.tile_m, 128) + self.assertEqual(tc.tile_n, 128) + self.assertEqual(tc.tile_k, 32) + self.assertEqual(tc.warp_m, 2) + self.assertEqual(tc.warp_n, 2) + self.assertEqual(tc.warp_k, 1) + self.assertEqual(tc.warp_tile_m, 32) + self.assertEqual(tc.warp_tile_n, 32) + self.assertEqual(tc.warp_tile_k, 16) + + def test_tile_config_is_from_codegen_common(self): + """TileConfig used by grouped conv is the same as codegen_common.TileConfig.""" + tc = TileConfig(128, 128, 32, 2, 2, 1, 32, 32, 16) + self.assertTrue(tc.is_valid()) + + def test_grouped_conv_trait_config_extends_trait_config_base(self): + """GroupedConvTraitConfig extends TraitConfigBase.""" + self.assertTrue(issubclass(GroupedConvTraitConfig, TraitConfigBase)) + + def test_grouped_conv_trait_config_has_double_smem_buffer(self): + """GroupedConvTraitConfig has double_smem_buffer field.""" + trait = GroupedConvTraitConfig( + "mem", + "cshuffle", + "intrawave", + False, + False, + False, + double_smem_buffer=True, + num_groups_to_merge=2, + ) + self.assertTrue(trait.double_smem_buffer) + self.assertEqual(trait.num_groups_to_merge, 2) + + def test_grouped_conv_trait_config_has_num_groups_to_merge(self): + """GroupedConvTraitConfig has num_groups_to_merge field.""" + trait = GroupedConvTraitConfig( + "mem", + "cshuffle", + "intrawave", + False, + False, + False, + double_smem_buffer=False, + num_groups_to_merge=4, + ) + self.assertEqual(trait.num_groups_to_merge, 4) + + def test_grouped_conv_trait_config_inherits_base_fields(self): + """GroupedConvTraitConfig inherits pipeline, epilogue, scheduler from base.""" + trait = GroupedConvTraitConfig( + "compv4", + "cshuffle", + "intrawave", + True, + True, + True, + double_smem_buffer=False, + num_groups_to_merge=1, + ) + self.assertEqual(trait.pipeline, "compv4") + self.assertEqual(trait.epilogue, "cshuffle") + 
self.assertEqual(trait.scheduler, "intrawave") + self.assertTrue(trait.pad_m) + self.assertTrue(trait.pad_n) + self.assertTrue(trait.pad_k) + + +# ============================================================================= +# TestTwoStageBwdWeightCodegen +# ============================================================================= + + +def _make_two_stage_config(): + """Helper: create a two-stage bwd_weight config.""" + return GroupedConvKernelConfig( + tile=TileConfig(16, 64, 64, 1, 4, 1, 16, 16, 32), + trait=GroupedConvTraitConfig( + pipeline="compv3", + epilogue="cshuffle", + scheduler="intrawave", + pad_m=True, + pad_n=True, + pad_k=True, + two_stage=True, + ), + variant=GroupedConvVariant.BACKWARD_WEIGHT, + ndim_spatial=2, + arch="gfx942", + ) + + +class TestTwoStageBwdWeightCodegen(unittest.TestCase): + """Tests for two-stage backward weight kernel generation.""" + + def test_kernel_name_contains_2stage(self): + config = _make_two_stage_config() + name = config.name("fp16") + self.assertIn("_2stage", name) + self.assertIn("bwd_weight", name) + + def test_single_stage_name_has_no_2stage(self): + config = _make_two_stage_config() + config.trait.two_stage = False + name = config.name("fp16") + self.assertNotIn("_2stage", name) + + def test_generate_contains_elementwise_include(self): + config = _make_two_stage_config() + gen = CKTileGroupedConvKernelGenerator( + "fp16", GroupedConvVariant.BACKWARD_WEIGHT + ) + code = gen.generate(config) + self.assertIn("elementwise.hpp", code) + + def test_generate_contains_workspace_type(self): + config = _make_two_stage_config() + gen = CKTileGroupedConvKernelGenerator( + "fp16", GroupedConvVariant.BACKWARD_WEIGHT + ) + code = gen.generate(config) + self.assertIn("WorkspaceDataType", code) + + def test_generate_contains_elementwise_kernel(self): + config = _make_two_stage_config() + gen = CKTileGroupedConvKernelGenerator( + "fp16", GroupedConvVariant.BACKWARD_WEIGHT + ) + code = gen.generate(config) + self.assertIn("ElementWiseKernel", code) + + def test_generate_contains_launch_kernel_time_mask(self): + config = _make_two_stage_config() + gen = CKTileGroupedConvKernelGenerator( + "fp16", GroupedConvVariant.BACKWARD_WEIGHT + ) + code = gen.generate(config) + self.assertIn("launch_kernel_time_mask", code) + + def test_generate_forces_vector_size_c_to_1(self): + config = _make_two_stage_config() + gen = CKTileGroupedConvKernelGenerator( + "fp16", GroupedConvVariant.BACKWARD_WEIGHT + ) + code = gen.generate(config) + self.assertIn("VectorSizeC_TwoStage = 1", code) + + def test_generate_contains_workspace_memset(self): + config = _make_two_stage_config() + gen = CKTileGroupedConvKernelGenerator( + "fp16", GroupedConvVariant.BACKWARD_WEIGHT + ) + code = gen.generate(config) + self.assertIn("hipMemsetAsync", code) + + def test_single_stage_does_not_contain_workspace(self): + config = _make_two_stage_config() + config.trait.two_stage = False + gen = CKTileGroupedConvKernelGenerator( + "fp16", GroupedConvVariant.BACKWARD_WEIGHT + ) + code = gen.generate(config) + self.assertNotIn("WorkspaceDataType", code) + self.assertNotIn("ElementWiseKernel", code) + self.assertNotIn("launch_kernel_time_mask", code) + + def test_default_configs_include_two_stage(self): + from unified_grouped_conv_codegen import get_default_configs + + configs = get_default_configs( + arch="gfx942", + variants=[GroupedConvVariant.BACKWARD_WEIGHT], + ndims=[2], + ) + two_stage = [c for c in configs if c.trait.two_stage] + single_stage = [c for c in configs if not c.trait.two_stage] + 
self.assertGreater(len(two_stage), 0, "Should have two-stage configs") + self.assertGreater( + len(single_stage), 0, "Should still have single-stage configs" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/dispatcher/tests/test_grouped_conv_config.cpp b/dispatcher/tests/test_grouped_conv_config.cpp new file mode 100644 index 0000000000..c9a1faeaf9 --- /dev/null +++ b/dispatcher/tests/test_grouped_conv_config.cpp @@ -0,0 +1,112 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/// Unit tests for GroupedConvConfig using assert() and std::cout + +#include "ck_tile/dispatcher/grouped_conv_config.hpp" +#include <cassert> +#include <iostream> + +using namespace ck_tile::dispatcher; + +void test_grouped_conv_direction_enum() +{ + std::cout << " test_grouped_conv_direction_enum... "; + assert(GroupedConvSignatureInfo::direction_str(GroupedConvDirection::FORWARD) == + std::string("fwd")); + assert(GroupedConvSignatureInfo::direction_str(GroupedConvDirection::BACKWARD_DATA) == + std::string("bwd_data")); + assert(GroupedConvSignatureInfo::direction_str(GroupedConvDirection::BACKWARD_WEIGHT) == + std::string("bwd_weight")); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_signature_info() +{ + std::cout << " test_grouped_conv_signature_info... "; + GroupedConvSignatureInfo sig; + assert(sig.spatial_dim == 2); + assert(sig.direction == GroupedConvDirection::FORWARD); + assert(sig.in_type == "fp16"); + assert(sig.wei_type == "fp16"); + assert(sig.out_type == "fp16"); + assert(sig.acc_type == "fp32"); + assert(sig.num_groups == 1); + sig.in_type = "bf16"; + sig.num_groups = 4; + assert(sig.in_type == "bf16"); + assert(sig.num_groups == 4); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_algorithm_info() +{ + std::cout << " test_grouped_conv_algorithm_info... "; + GroupedConvAlgorithmInfo algo; + assert(algo.tile.m == 128); + assert(algo.tile.n == 128); + assert(algo.tile.k == 64); + assert(algo.pipeline == PipelineVersion::V4); + assert(algo.scheduler == PipelineScheduler::INTRAWAVE); + assert(GroupedConvAlgorithmInfo::pipeline_str(PipelineVersion::V4) == std::string("compv4")); + assert(GroupedConvAlgorithmInfo::scheduler_str(PipelineScheduler::INTRAWAVE) == + std::string("intrawave")); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_config() +{ + std::cout << " test_grouped_conv_config... "; + GroupedConvConfig cfg; + std::string name = cfg.name(); + assert(!name.empty()); + assert(name.find("grouped_conv_") != std::string::npos); + assert(name.find("fwd") != std::string::npos); + assert(name.find("fp16") != std::string::npos); + assert(name.find("2d") != std::string::npos); + + std::string brief = cfg.brief(); + assert(!brief.empty()); + assert(brief.find("2D") != std::string::npos || brief.find("Grouped") != std::string::npos); + + std::string detailed = cfg.detailed(); + assert(!detailed.empty()); + assert(detailed.find("Signature:") != std::string::npos); + assert(detailed.find("Algorithm:") != std::string::npos); + assert(detailed.find("Arch:") != std::string::npos); + std::cout << "PASSED\n"; +} + +void test_predefined_grouped_conv_configs() +{ + std::cout << " test_predefined_grouped_conv_configs... 
"; + configs::Memory mem_cfg; + assert(mem_cfg.algorithm.pipeline == PipelineVersion::MEMORY); + assert(mem_cfg.algorithm.tile.m == 128); + assert(mem_cfg.algorithm.tile.n == 32); + + configs::CompV3_Small compv3_small; + assert(compv3_small.algorithm.pipeline == PipelineVersion::V3); + assert(compv3_small.algorithm.tile.m == 16); + assert(compv3_small.algorithm.tile.n == 64); + + configs::CompV4 compv4; + assert(compv4.algorithm.pipeline == PipelineVersion::V4); + assert(compv4.algorithm.double_smem_buffer == true); + + configs::WMMA wmma_cfg; + assert(wmma_cfg.arch.name == "gfx1100"); + std::cout << "PASSED\n"; +} + +int main() +{ + std::cout << "\n=== Test Grouped Conv Config ===\n\n"; + test_grouped_conv_direction_enum(); + test_grouped_conv_signature_info(); + test_grouped_conv_algorithm_info(); + test_grouped_conv_config(); + test_predefined_grouped_conv_configs(); + std::cout << "\n=== All Tests Passed! ===\n\n"; + return 0; +} diff --git a/dispatcher/tests/test_grouped_conv_kernel_decl.cpp b/dispatcher/tests/test_grouped_conv_kernel_decl.cpp new file mode 100644 index 0000000000..7b28a451bc --- /dev/null +++ b/dispatcher/tests/test_grouped_conv_kernel_decl.cpp @@ -0,0 +1,141 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/// Unit tests for GroupedConvKernelDecl using assert() and std::cout + +#include "ck_tile/dispatcher/grouped_conv_kernel_decl.hpp" +#include +#include + +using namespace ck_tile::dispatcher; +using namespace ck_tile::dispatcher::grouped_conv_decl; + +void test_grouped_conv_signature_builder() +{ + std::cout << " test_grouped_conv_signature_builder... "; + GroupedConvSignature sig; + sig.dtype("fp16").layout("nhwc").conv_type("forward").dims(2).groups(4); + assert(sig.dtype_in_ == "fp16"); + assert(sig.dtype_wei_ == "fp16"); + assert(sig.dtype_out_ == "fp16"); + assert(sig.layout_ == "nhwc"); + assert(sig.conv_op_ == "forward"); + assert(sig.num_dims_ == 2); + assert(sig.groups_ == 4); + assert(sig.op_str() == "fwd"); + sig.conv_type("bwd_data"); + assert(sig.op_str() == "bwd_data"); + sig.conv_type("bwd_weight"); + assert(sig.op_str() == "bwd_weight"); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_algorithm_builder() +{ + std::cout << " test_grouped_conv_algorithm_builder... "; + GroupedConvAlgorithm algo; + algo.tile(128, 128, 64) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv4") + .scheduler("intrawave"); + assert(algo.tile_m_ == 128); + assert(algo.tile_n_ == 128); + assert(algo.tile_k_ == 64); + assert(algo.wave_m_ == 2); + assert(algo.wave_n_ == 2); + assert(algo.warp_m_ == 32); + assert(algo.warp_n_ == 32); + assert(algo.pipeline_ == "compv4"); + assert(algo.scheduler_ == "intrawave"); + assert(!algo.needs_expansion()); + algo.wave_m_ = ANY_INT; + assert(algo.needs_wave_expansion()); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_kernel_decl() +{ + std::cout << " test_grouped_conv_kernel_decl... 
"; + GroupedConvSignature sig; + sig.dtype("fp16").layout("nhwc").conv_type("forward").dims(2); + GroupedConvAlgorithm algo; + algo.tile(128, 128, 64).wave(2, 2, 1).warp(32, 32, 16); + GroupedConvKernelDecl decl(sig, algo, "gfx942"); + std::string name = decl.name(); + assert(!name.empty()); + assert(name.find("grouped_conv_") != std::string::npos); + assert(name.find("fwd") != std::string::npos); + assert(name.find("fp16") != std::string::npos); + assert(name.find("128x128x64") != std::string::npos); + assert(!decl.has_wildcards()); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_kernel_set() +{ + std::cout << " test_grouped_conv_kernel_set... "; + GroupedConvKernelSet set; + set.add("fp16", "nhwc", "forward", 128, 128); + assert(set.size() == 1); + set.add("fp16", "nhwc", "forward", 256, 256); + assert(set.size() == 2); + const auto& decls = set.declarations(); + assert(decls[0].algorithm.tile_n_ == 128); + assert(decls[0].algorithm.tile_k_ == 128); + assert(decls[1].algorithm.tile_n_ == 256); + assert(decls[1].algorithm.tile_k_ == 256); + set.tag("test_set"); + assert(set.tag() == "test_set"); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_kernel_set_merge() +{ + std::cout << " test_grouped_conv_kernel_set_merge... "; + GroupedConvKernelSet set1; + set1.add("fp16", "nhwc", "forward", 128, 128); + GroupedConvKernelSet set2; + set2.add("fp16", "nhwc", "forward", 256, 256); + set1.merge(set2); + assert(set1.size() == 2); + assert(set1.declarations()[0].algorithm.tile_n_ == 128); + assert(set1.declarations()[1].algorithm.tile_n_ == 256); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_kernel_set_registry() +{ + std::cout << " test_grouped_conv_kernel_set_registry... "; + auto& reg = GroupedConvKernelSetRegistry::instance(); + reg.clear(); + + GroupedConvKernelSet set; + set.add("fp16", "nhwc", "forward", 128, 128); + reg.register_set("gconv_test", set); + assert(reg.has("gconv_test")); + assert(reg.size() >= 1); + + const auto& retrieved = reg.get("gconv_test"); + assert(retrieved.size() == 1); + + const auto& empty = reg.get("nonexistent"); + assert(empty.size() == 0); + + reg.clear(); + assert(!reg.has("gconv_test")); + std::cout << "PASSED\n"; +} + +int main() +{ + std::cout << "\n=== Test Grouped Conv Kernel Decl ===\n\n"; + test_grouped_conv_signature_builder(); + test_grouped_conv_algorithm_builder(); + test_grouped_conv_kernel_decl(); + test_grouped_conv_kernel_set(); + test_grouped_conv_kernel_set_merge(); + test_grouped_conv_kernel_set_registry(); + std::cout << "\n=== All Tests Passed! ===\n\n"; + return 0; +} diff --git a/dispatcher/tests/test_grouped_conv_problem.cpp b/dispatcher/tests/test_grouped_conv_problem.cpp new file mode 100644 index 0000000000..a6a4d8ba08 --- /dev/null +++ b/dispatcher/tests/test_grouped_conv_problem.cpp @@ -0,0 +1,245 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/// Unit tests for GroupedConvProblem using assert() and std::cout + +#include "ck_tile/dispatcher/grouped_conv_problem.hpp" +#include +#include +#include + +using namespace ck_tile::dispatcher; + +void test_grouped_conv_problem_defaults() +{ + std::cout << " test_grouped_conv_problem_defaults... 
"; + GroupedConvProblem p; + assert(p.N == 1); + assert(p.C == 64); + assert(p.K == 64); + assert(p.G == 1); + assert(p.Hi() == 28); + assert(p.Wi() == 28); + assert(p.Y() == 3); + assert(p.X() == 3); + assert(p.op == GroupedConvOp::Forward); + assert(p.stride[0] == 1 && p.stride[1] == 1 && p.stride[2] == 1); + assert(p.padding[0] == 0 && p.padding[1] == 0 && p.padding[2] == 0); + assert(p.dilation[0] == 1 && p.dilation[1] == 1 && p.dilation[2] == 1); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_problem_2d() +{ + std::cout << " test_grouped_conv_problem_2d... "; + GroupedConvProblem p(4, 64, 128, 28, 28, 3, 3); + p.compute_output_size(); + assert(p.N == 4); + assert(p.C == 64); + assert(p.K == 128); + assert(p.Hi() == 28); + assert(p.Wi() == 28); + assert(p.Y() == 3); + assert(p.X() == 3); + assert(p.Ho() == 26); + assert(p.Wo() == 26); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_problem_strided() +{ + std::cout << " test_grouped_conv_problem_strided... "; + GroupedConvProblem p; + p.N = 1; + p.C = 64; + p.K = 64; + p.G = 1; + p.input_spatial = {1, 14, 14}; + p.filter_spatial = {1, 3, 3}; + p.stride = {1, 2, 2}; + p.padding = {0, 1, 1}; + p.dilation = {1, 1, 1}; + p.compute_output_size(); + assert(p.Ho() == 7); + assert(p.Wo() == 7); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_problem_grouped() +{ + std::cout << " test_grouped_conv_problem_grouped... "; + GroupedConvProblem p; + p.N = 2; + p.C = 64; + p.K = 64; + p.G = 4; + p.input_spatial = {1, 14, 14}; + p.filter_spatial = {1, 3, 3}; + p.stride = {1, 1, 1}; + p.padding = {0, 0, 0}; + p.dilation = {1, 1, 1}; + p.compute_output_size(); + assert(p.G == 4); + assert(p.C % p.G == 0); + assert(p.K % p.G == 0); + assert(p.is_valid()); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_problem_depthwise() +{ + std::cout << " test_grouped_conv_problem_depthwise... "; + GroupedConvProblem p; + p.N = 2; + p.C = 64; + p.K = 64; + p.G = 64; + p.input_spatial = {1, 14, 14}; + p.filter_spatial = {1, 3, 3}; + p.stride = {1, 1, 1}; + p.padding = {0, 0, 0}; + p.dilation = {1, 1, 1}; + p.compute_output_size(); + assert(p.is_depthwise()); + assert(p.G == p.C && p.G == p.K); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_problem_pointwise() +{ + std::cout << " test_grouped_conv_problem_pointwise... "; + GroupedConvProblem p; + p.N = 2; + p.C = 64; + p.K = 128; + p.G = 1; + p.input_spatial = {1, 14, 14}; + p.filter_spatial = {1, 1, 1}; + p.stride = {1, 1, 1}; + p.padding = {0, 0, 0}; + p.dilation = {1, 1, 1}; + p.compute_output_size(); + assert(p.is_pointwise()); + assert(p.Y() == 1 && p.X() == 1); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_problem_flops() +{ + std::cout << " test_grouped_conv_problem_flops... "; + GroupedConvProblem p; + p.N = 2; + p.C = 64; + p.K = 64; + p.G = 1; + p.input_spatial = {1, 14, 14}; + p.filter_spatial = {1, 3, 3}; + p.stride = {1, 1, 1}; + p.padding = {0, 0, 0}; + p.dilation = {1, 1, 1}; + p.compute_output_size(); + double flops = p.get_flops(); + assert(flops > 0); + assert(flops == 2.0 * p.N * p.K * p.Ho() * p.Wo() * (p.C / p.G) * p.Y() * p.X()); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_problem_is_valid() +{ + std::cout << " test_grouped_conv_problem_is_valid... 
"; + GroupedConvProblem p; + p.N = 1; + p.C = 64; + p.K = 64; + p.G = 1; + p.input_spatial = {1, 14, 14}; + p.filter_spatial = {1, 3, 3}; + p.compute_output_size(); + assert(p.is_valid()); + + p.N = 0; + assert(!p.is_valid()); + p.N = 1; + + p.C = 0; + assert(!p.is_valid()); + p.C = 64; + + p.K = 0; + assert(!p.is_valid()); + p.K = 64; + + p.G = 0; + assert(!p.is_valid()); + p.G = 1; + + p.C = 64; + p.K = 64; + p.G = 3; + assert(!p.is_valid()); + p.G = 4; + assert(p.is_valid()); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_problem_builder() +{ + std::cout << " test_grouped_conv_problem_builder... "; + auto p = GroupedConvProblemBuilder() + .batch(8) + .channels(128, 256) + .groups(4) + .input_size(32, 32) + .filter_size(3, 3) + .stride(2, 2) + .padding(1, 1) + .dilation(1, 1) + .operation(GroupedConvOp::Forward) + .build(); + assert(p.N == 8); + assert(p.C == 128); + assert(p.K == 256); + assert(p.G == 4); + assert(p.Hi() == 32); + assert(p.Wi() == 32); + assert(p.Y() == 3); + assert(p.X() == 3); + assert(p.stride[1] == 2 && p.stride[2] == 2); + assert(p.padding[1] == 1 && p.padding[2] == 1); + assert(p.op == GroupedConvOp::Forward); + assert(p.is_valid()); + + bool threw = false; + try + { + (void)GroupedConvProblemBuilder() + .batch(0) + .channels(64, 64) + .groups(1) + .input_size(14, 14) + .filter_size(3, 3) + .build(); + } + catch(const std::invalid_argument&) + { + threw = true; + } + assert(threw); + std::cout << "PASSED\n"; +} + +int main() +{ + std::cout << "\n=== Test Grouped Conv Problem ===\n\n"; + test_grouped_conv_problem_defaults(); + test_grouped_conv_problem_2d(); + test_grouped_conv_problem_strided(); + test_grouped_conv_problem_grouped(); + test_grouped_conv_problem_depthwise(); + test_grouped_conv_problem_pointwise(); + test_grouped_conv_problem_flops(); + test_grouped_conv_problem_is_valid(); + test_grouped_conv_problem_builder(); + std::cout << "\n=== All Tests Passed! ===\n\n"; + return 0; +} diff --git a/dispatcher/tests/test_grouped_conv_registry.cpp b/dispatcher/tests/test_grouped_conv_registry.cpp new file mode 100644 index 0000000000..47d13a9997 --- /dev/null +++ b/dispatcher/tests/test_grouped_conv_registry.cpp @@ -0,0 +1,230 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/// Unit tests for GroupedConvRegistry and GroupedConvDispatcher using assert() and std::cout + +#include "ck_tile/dispatcher/grouped_conv_utils.hpp" +#include +#include +#include +#include + +using namespace ck_tile::dispatcher; +using namespace ck_tile::dispatcher::grouped_conv_decl; + +void test_grouped_conv_registry_basic() +{ + std::cout << " test_grouped_conv_registry_basic... "; + GroupedConvRegistry& reg = GroupedConvRegistry::instance(); + reg.clear(); + + reg.set_name("test_registry"); + assert(reg.name() == "test_registry"); + + assert(reg.size() == 0); + assert(reg.empty()); + + reg.clear(); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_registry_register_set() +{ + std::cout << " test_grouped_conv_registry_register_set... "; + GroupedConvRegistry& reg = GroupedConvRegistry::instance(); + reg.clear(); + + GroupedConvKernelSet set; + set.add("fp16", "nhwc", "forward", 128, 128); + set.add("fp16", "nhwc", "forward", 256, 256); + + bool ok = reg.register_set(set); + assert(ok); + assert(reg.size() == 2); + assert(!reg.empty()); + + reg.clear(); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_registry_all_kernels() +{ + std::cout << " test_grouped_conv_registry_all_kernels... 
"; + GroupedConvRegistry& reg = GroupedConvRegistry::instance(); + reg.clear(); + + GroupedConvKernelSet set; + set.add("fp16", "nhwc", "forward", 128, 128); + reg.register_set(set); + + auto all = reg.all_kernels(); + assert(all.size() == 1); + assert(all[0]->name().find("grouped_conv_") != std::string::npos); + + reg.clear(); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_registry_clear() +{ + std::cout << " test_grouped_conv_registry_clear... "; + GroupedConvRegistry& reg = GroupedConvRegistry::instance(); + reg.clear(); + + GroupedConvKernelSet set; + set.add("fp16", "nhwc", "forward", 128, 128); + reg.register_set(set); + assert(reg.size() == 1); + + reg.clear(); + assert(reg.size() == 0); + assert(reg.empty()); + + reg.clear(); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_registry_thread_safe() +{ + std::cout << " test_grouped_conv_registry_thread_safe... "; + GroupedConvRegistry& reg = GroupedConvRegistry::instance(); + reg.clear(); + + const int num_threads = 4; + const int sets_per_thread = 10; + std::vector threads; + std::atomic success_count{0}; + + for(int t = 0; t < num_threads; t++) + { + threads.emplace_back([t, ®, &success_count]() { + for(int k = 0; k < sets_per_thread; k++) + { + GroupedConvKernelSet set; + set.add("fp16", "nhwc", "forward", 128 + t * 32 + k, 128); + if(reg.register_set(set)) + { + success_count++; + } + } + }); + } + + for(auto& th : threads) + th.join(); + + assert(reg.size() == num_threads * sets_per_thread); + assert(success_count.load() == num_threads * sets_per_thread); + + reg.clear(); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_registry_export_json() +{ + std::cout << " test_grouped_conv_registry_export_json... "; + GroupedConvRegistry& reg = GroupedConvRegistry::instance(); + reg.clear(); + + GroupedConvKernelSet set; + set.add("fp16", "nhwc", "forward", 128, 128); + reg.register_set(set); + + std::string json = reg.export_json(false); + assert(!json.empty()); + assert(json.find("\"kernels\"") != std::string::npos); + assert(json.find("\"metadata\"") != std::string::npos); + assert(json.find("grouped_conv_") != std::string::npos); + + std::string json_stats = reg.export_json(true); + assert(json_stats.find("\"statistics\"") != std::string::npos); + + reg.clear(); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_registry_filter() +{ + std::cout << " test_grouped_conv_registry_filter... "; + GroupedConvRegistry& reg = GroupedConvRegistry::instance(); + reg.clear(); + + GroupedConvKernelSet set; + set.add("fp16", "nhwc", "forward", 128, 128); + set.add("fp16", "nhwc", "forward", 256, 256); + set.add("bf16", "nhwc", "forward", 128, 128); + reg.register_set(set); + + auto fp16_only = + reg.filter([](const GroupedConvKernelInstance& k) { return k.key().dtype_in == "fp16"; }); + assert(fp16_only.size() == 2); + + auto large_tile = reg.filter([](const GroupedConvKernelInstance& k) { + return k.key().tile_m >= 256 || k.key().tile_n >= 256; + }); + assert(large_tile.size() >= 1); + + reg.clear(); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_dispatcher_basic() +{ + std::cout << " test_grouped_conv_dispatcher_basic... 
"; + GroupedConvRegistry& reg = GroupedConvRegistry::instance(); + reg.clear(); + + GroupedConvKernelSet set; + set.add("fp16", "nhwc", "forward", 128, 128); + reg.register_set(set); + + GroupedConvDispatcher dispatcher(®); + GroupedConvProblem problem = grouped_conv_utils::create_grouped_conv2d_problem( + 4, 64, 128, 28, 28, 3, 3, 1, 1, GroupedConvOp::Forward); + + float time = dispatcher.run(problem, nullptr); + assert(time >= 0.0f); + + reg.clear(); + std::cout << "PASSED\n"; +} + +void test_grouped_conv_dispatcher_select() +{ + std::cout << " test_grouped_conv_dispatcher_select... "; + GroupedConvRegistry& reg = GroupedConvRegistry::instance(); + reg.clear(); + + GroupedConvKernelSet set; + set.add("fp16", "nhwc", "forward", 128, 128); + set.add("fp16", "nhwc", "forward", 256, 256); + reg.register_set(set); + + GroupedConvDispatcher dispatcher(®); + GroupedConvProblem problem = grouped_conv_utils::create_grouped_conv2d_problem( + 4, 64, 128, 28, 28, 3, 3, 1, 1, GroupedConvOp::Forward); + + const auto* selected = dispatcher.select(problem); + assert(selected != nullptr); + assert(selected->name().find("grouped_conv_") != std::string::npos); + assert(selected->matches(problem)); + + reg.clear(); + std::cout << "PASSED\n"; +} + +int main() +{ + std::cout << "\n=== Test Grouped Conv Registry ===\n\n"; + test_grouped_conv_registry_basic(); + test_grouped_conv_registry_register_set(); + test_grouped_conv_registry_all_kernels(); + test_grouped_conv_registry_clear(); + test_grouped_conv_registry_thread_safe(); + test_grouped_conv_registry_export_json(); + test_grouped_conv_registry_filter(); + test_grouped_conv_dispatcher_basic(); + test_grouped_conv_dispatcher_select(); + std::cout << "\n=== All Tests Passed! ===\n\n"; + return 0; +} diff --git a/dispatcher/tests/test_grouped_conv_utils.py b/dispatcher/tests/test_grouped_conv_utils.py new file mode 100644 index 0000000000..9d0638dc08 --- /dev/null +++ b/dispatcher/tests/test_grouped_conv_utils.py @@ -0,0 +1,349 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +TDD tests for python/grouped_conv_utils.py -- grouped convolution Python utilities. + +Phase 1 TDD: tests written BEFORE implementation exists. 
+Run: python3 -m pytest tests/test_grouped_conv_utils.py -v +""" + +import sys +import unittest +from pathlib import Path + +SCRIPT_DIR = Path(__file__).parent.resolve() +DISPATCHER_DIR = SCRIPT_DIR.parent +sys.path.insert(0, str(DISPATCHER_DIR / "python")) +sys.path.insert(0, str(DISPATCHER_DIR / "codegen")) + +from dispatcher_common import ValidationResultBase # noqa: E402 +from grouped_conv_utils import ( # noqa: E402 + GroupedConvValidationResult, + validate_grouped_conv_config, + auto_correct_grouped_conv_config, + get_grouped_conv_default_config, + GroupedConvDataType, + format_grouped_conv_summary, +) + + +# ============================================================================= +# VALID CONFIG FIXTURES +# ============================================================================= + + +def make_valid_grouped_conv_config(): + """Return a valid grouped conv config dict for gfx942.""" + return { + "tile_config": { + "tile_k": 128, + "tile_c": 128, + "wave_m": 2, + "wave_n": 2, + "wave_k": 1, + "warp_m": 32, + "warp_n": 32, + "warp_k": 16, + }, + "trait_config": { + "pipeline": "compv4", + "epilogue": "cshuffle", + "scheduler": "intrawave", + }, + "variant": "2d_fwd", + "ndim_spatial": 2, + "arch": "gfx942", + "layout": "nhwgc", + "dtype": "fp16", + } + + +# ============================================================================= +# TestGroupedConvValidationResult +# ============================================================================= + + +class TestGroupedConvValidationResult(unittest.TestCase): + """Tests for GroupedConvValidationResult dataclass.""" + + def test_inherits_from_validation_result_base(self): + """GroupedConvValidationResult should inherit from ValidationResultBase.""" + self.assertTrue( + issubclass(GroupedConvValidationResult, ValidationResultBase), + "GroupedConvValidationResult must inherit from ValidationResultBase", + ) + + def test_valid_result_has_is_valid(self): + """Valid result has is_valid=True.""" + vr = GroupedConvValidationResult(is_valid=True) + self.assertTrue(vr.is_valid) + + def test_invalid_result_has_is_valid_false(self): + """Invalid result has is_valid=False.""" + vr = GroupedConvValidationResult(is_valid=False, errors=["bad config"]) + self.assertFalse(vr.is_valid) + + def test_has_errors_list(self): + """Result has errors list.""" + vr = GroupedConvValidationResult( + is_valid=False, + errors=["invalid wave", "invalid trait"], + ) + self.assertEqual(len(vr.errors), 2) + self.assertIn("invalid wave", vr.errors) + self.assertIn("invalid trait", vr.errors) + + def test_has_warnings_list(self): + """Result has warnings list.""" + vr = GroupedConvValidationResult( + is_valid=True, + warnings=["deprecated option"], + ) + self.assertEqual(len(vr.warnings), 1) + self.assertIn("deprecated option", vr.warnings) + + def test_has_suggested_fixes_dict(self): + """Result has suggested_fixes dict.""" + vr = GroupedConvValidationResult( + is_valid=False, + suggested_fixes={"wave_m": 2, "wave_n": 2}, + ) + self.assertIn("wave_m", vr.suggested_fixes) + self.assertEqual(vr.suggested_fixes["wave_m"], 2) + self.assertIn("wave_n", vr.suggested_fixes) + self.assertEqual(vr.suggested_fixes["wave_n"], 2) + + def test_default_empty_errors_warnings_fixes(self): + """Default result has empty errors, warnings, suggested_fixes.""" + vr = GroupedConvValidationResult(is_valid=True) + self.assertEqual(vr.errors, []) + self.assertEqual(vr.warnings, []) + self.assertEqual(vr.suggested_fixes, {}) + + +# 
============================================================================= +# TestValidateGroupedConvConfig +# ============================================================================= + + +class TestValidateGroupedConvConfig(unittest.TestCase): + """Tests for validate_grouped_conv_config.""" + + def test_valid_config_passes(self): + """Valid config should pass validation.""" + config = make_valid_grouped_conv_config() + result = validate_grouped_conv_config(config) + self.assertTrue(result.is_valid, f"Expected valid, got errors: {result.errors}") + self.assertEqual(result.errors, []) + + def test_invalid_wave_config_fails(self): + """Invalid wave config should fail validation.""" + config = make_valid_grouped_conv_config() + config["tile_config"]["wave_m"] = 3 + config["tile_config"]["wave_n"] = 3 + result = validate_grouped_conv_config(config) + self.assertFalse(result.is_valid) + self.assertGreater(len(result.errors), 0) + error_str = " ".join(result.errors).lower() + self.assertIn("wave", error_str) + + def test_invalid_trait_fails(self): + """Invalid trait combination should fail validation.""" + config = make_valid_grouped_conv_config() + config["trait_config"]["pipeline"] = "compv4" + config["trait_config"]["epilogue"] = "cshuffle" + config["trait_config"]["scheduler"] = "interwave" # Invalid combo + result = validate_grouped_conv_config(config) + self.assertFalse(result.is_valid) + self.assertGreater(len(result.errors), 0) + error_str = " ".join(result.errors).lower() + self.assertIn("trait", error_str) + + def test_missing_fields_fails(self): + """Config with missing required fields should fail validation.""" + config = {"arch": "gfx942"} # Missing tile_config, trait_config, etc. + result = validate_grouped_conv_config(config) + self.assertFalse(result.is_valid) + self.assertGreater(len(result.errors), 0) + + +# ============================================================================= +# TestAutoCorrectGroupedConvConfig +# ============================================================================= + + +class TestAutoCorrectGroupedConvConfig(unittest.TestCase): + """Tests for auto_correct_grouped_conv_config.""" + + def test_invalid_wave_gets_corrected(self): + """Invalid wave config should be auto-corrected.""" + config = make_valid_grouped_conv_config() + config["tile_config"]["wave_m"] = 3 + config["tile_config"]["wave_n"] = 3 + corrected, result = auto_correct_grouped_conv_config(config) + self.assertIsInstance(corrected, dict) + self.assertIsInstance(result, GroupedConvValidationResult) + # Corrected wave should be valid for arch + wave_m = corrected.get("tile_config", {}).get("wave_m") + wave_n = corrected.get("tile_config", {}).get("wave_n") + self.assertIn(wave_m, [1, 2, 4]) + self.assertIn(wave_n, [1, 2, 4]) + + def test_invalid_trait_gets_corrected(self): + """Invalid trait combination should be auto-corrected.""" + config = make_valid_grouped_conv_config() + config["trait_config"]["scheduler"] = "interwave" + config["trait_config"]["pipeline"] = "compv4" + config["trait_config"]["epilogue"] = "cshuffle" + corrected, result = auto_correct_grouped_conv_config(config) + self.assertIsInstance(corrected, dict) + self.assertIsInstance(result, GroupedConvValidationResult) + # Scheduler should be corrected to intrawave for compv4+cshuffle + scheduler = corrected.get("trait_config", {}).get("scheduler") + self.assertEqual(scheduler, "intrawave") + + +# ============================================================================= +# TestGetGroupedConvDefaultConfig +# 
============================================================================= + + +class TestGetGroupedConvDefaultConfig(unittest.TestCase): + """Tests for get_grouped_conv_default_config.""" + + def test_returns_config(self): + """Should return a GroupedConvKernelConfig (or dict via to_dict).""" + config = get_grouped_conv_default_config("2d_fwd") + # Accepts both dataclass and dict + d = config.to_dict() if hasattr(config, "to_dict") else config + self.assertIsInstance(d, dict) + + def test_has_tile_config(self): + """Returned config has tile_config key.""" + config = get_grouped_conv_default_config("2d_fwd") + d = config.to_dict() if hasattr(config, "to_dict") else config + self.assertIn("tile_config", d) + self.assertIsInstance(d["tile_config"], dict) + + def test_has_trait_config(self): + """Returned config has trait_config key.""" + config = get_grouped_conv_default_config("2d_fwd") + d = config.to_dict() if hasattr(config, "to_dict") else config + self.assertIn("trait_config", d) + self.assertIsInstance(d["trait_config"], dict) + + def test_has_variant(self): + """Returned config has variant.""" + config = get_grouped_conv_default_config("2d_fwd") + d = config.to_dict() if hasattr(config, "to_dict") else config + self.assertIn("variant", d) + + def test_has_ndim_spatial(self): + """Returned config has ndim_spatial.""" + config = get_grouped_conv_default_config("2d_fwd") + d = config.to_dict() if hasattr(config, "to_dict") else config + self.assertIn("ndim_spatial", d) + + def test_has_arch(self): + """Returned config has arch.""" + config = get_grouped_conv_default_config("2d_fwd") + d = config.to_dict() if hasattr(config, "to_dict") else config + self.assertIn("arch", d) + + def test_has_layout(self): + """Returned config has layout.""" + config = get_grouped_conv_default_config("2d_fwd") + d = config.to_dict() if hasattr(config, "to_dict") else config + self.assertIn("layout", d) + + +# ============================================================================= +# TestGroupedConvDataType +# ============================================================================= + + +class TestGroupedConvDataType(unittest.TestCase): + """Tests for GroupedConvDataType enum.""" + + def test_fp16_exists(self): + """GroupedConvDataType has FP16.""" + self.assertIsNotNone(GroupedConvDataType.FP16) + + def test_bf16_exists(self): + """GroupedConvDataType has BF16.""" + self.assertIsNotNone(GroupedConvDataType.BF16) + + def test_fp32_exists(self): + """GroupedConvDataType has FP32.""" + self.assertIsNotNone(GroupedConvDataType.FP32) + + def test_fp8_exists(self): + """GroupedConvDataType has FP8.""" + self.assertIsNotNone(GroupedConvDataType.FP8) + + def test_bf8_exists(self): + """GroupedConvDataType has BF8.""" + self.assertIsNotNone(GroupedConvDataType.BF8) + + def test_int8_exists(self): + """GroupedConvDataType has INT8.""" + self.assertIsNotNone(GroupedConvDataType.INT8) + + def test_enum_values_unique(self): + """All enum values should be unique.""" + values = [ + GroupedConvDataType.FP16, + GroupedConvDataType.BF16, + GroupedConvDataType.FP32, + GroupedConvDataType.FP8, + GroupedConvDataType.BF8, + GroupedConvDataType.INT8, + ] + self.assertEqual(len(values), len(set(values))) + + +# ============================================================================= +# TestFormatGroupedConvSummary +# ============================================================================= + + +class TestFormatGroupedConvSummary(unittest.TestCase): + """Tests for format_grouped_conv_summary.""" + + def 
test_returns_non_empty_string(self): + """Should return a non-empty string.""" + config = make_valid_grouped_conv_config() + summary = format_grouped_conv_summary(config) + self.assertIsInstance(summary, str) + self.assertGreater(len(summary), 0) + + def test_contains_key_info(self): + """Summary should contain key config info (variant, arch, layout, dtype).""" + config = make_valid_grouped_conv_config() + summary = format_grouped_conv_summary(config) + # Should mention at least some of: variant, arch, layout, dtype + summary_lower = summary.lower() + has_key_info = ( + "2d" in summary_lower + or "fwd" in summary_lower + or "gfx" in summary_lower + or "nhwgc" in summary_lower + or "fp16" in summary_lower + ) + self.assertTrue( + has_key_info, + f"Summary should contain key info, got: {summary}", + ) + + def test_empty_config_returns_something(self): + """Empty or minimal config should still return a string.""" + summary = format_grouped_conv_summary({}) + self.assertIsInstance(summary, str) + self.assertGreaterEqual(len(summary), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/dispatcher/tests/test_problem_extended.cpp b/dispatcher/tests/test_problem_extended.cpp index 21ea545292..ba6068e3ee 100644 --- a/dispatcher/tests/test_problem_extended.cpp +++ b/dispatcher/tests/test_problem_extended.cpp @@ -19,7 +19,7 @@ class ProblemDimensionInferenceTest : public ::testing::Test TEST_F(ProblemDimensionInferenceTest, FromAB_Basic) { - // A: M×K (1024×512), B: K×N (512×2048) + // A: MxK (1024x512), B: KxN (512x2048) auto problem = Problem::from_ab(1024, 512, 512, 2048); EXPECT_EQ(problem.M, 1024); @@ -30,7 +30,7 @@ TEST_F(ProblemDimensionInferenceTest, FromAB_Basic) TEST_F(ProblemDimensionInferenceTest, FromDimensions_Valid) { - // A: 1024×512, B: 512×2048, C: 1024×2048 + // A: 1024x512, B: 512x2048, C: 1024x2048 auto problem = Problem::from_dimensions(1024, 512, 512, 2048, 1024, 2048); EXPECT_EQ(problem.M, 1024); @@ -55,7 +55,7 @@ TEST_F(ProblemDimensionInferenceTest, FromShapes_WithC) TEST_F(ProblemDimensionInferenceTest, FromShapes_TransposedA) { - // A stored as K×M (transposed) + // A stored as KxM (transposed) TensorShape A{512, 1024, true}; TensorShape B{512, 2048, false}; TensorShape C{1024, 2048, false}; @@ -70,7 +70,7 @@ TEST_F(ProblemDimensionInferenceTest, FromShapes_TransposedA) TEST_F(ProblemDimensionInferenceTest, FromShapes_TransposedB) { TensorShape A{1024, 512, false}; - // B stored as N×K (transposed) + // B stored as NxK (transposed) TensorShape B{2048, 512, true}; TensorShape C{1024, 2048, false}; diff --git a/dispatcher/tests/test_real_kernel_multi_size.cpp b/dispatcher/tests/test_real_kernel_multi_size.cpp index f23f684631..79282da557 100644 --- a/dispatcher/tests/test_real_kernel_multi_size.cpp +++ b/dispatcher/tests/test_real_kernel_multi_size.cpp @@ -187,7 +187,7 @@ int main() for(const auto& r : results) { char size_str[32]; - snprintf(size_str, sizeof(size_str), "%4d×%4d×%4d", r.M, r.N, r.K); + snprintf(size_str, sizeof(size_str), "%4dx%4dx%4d", r.M, r.N, r.K); printf(" %-14s | %9.4f | %6.2f | %7.2f%% | %s\n", size_str, diff --git a/dispatcher/tests/test_real_kernel_performance.cpp b/dispatcher/tests/test_real_kernel_performance.cpp index ff3d635968..29c7c80ac3 100644 --- a/dispatcher/tests/test_real_kernel_performance.cpp +++ b/dispatcher/tests/test_real_kernel_performance.cpp @@ -144,7 +144,7 @@ int main() all_passed = all_passed && passed; char size_label[32]; - snprintf(size_label, sizeof(size_label), "%s %d³", label, M); + snprintf(size_label, 
sizeof(size_label), "%s %d^3", label, M); printf(" %-9s | %9.4f | %6.2f | %9.1f | %s\n", size_label, diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index b7226270fc..c4c69cb751 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -209,7 +209,7 @@ referencing==0.37.0 # via # jsonschema # jsonschema-specifications -requests==2.32.5 +requests==2.33.0 # via # pygithub # sphinx diff --git a/example/26_contraction/common_instances.hpp b/example/26_contraction/common_instances.hpp index 457bae21aa..808c548042 100644 --- a/example/26_contraction/common_instances.hpp +++ b/example/26_contraction/common_instances.hpp @@ -194,3 +194,35 @@ using DeviceOpInstanceMN_FP64 = ck::tensor_operation::device:: //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>; // clang-format on + +// Macro to instantiate all four layout variants of DeviceOpInstance. +// +// BASE: Generic (for fp16/bf16/fp32) or FP64 (for fp64 — different tile sizes) +// SUFFIX: NN for bilinear (DsDataType = Tuple), +// N for scale (DsDataType = Tuple<>) +// +// Requires these names to be defined in the calling TU before invocation: +// NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, +// CShuffleDataType, DsDataType, EDataType, ComputeDataType, +// AElementOp, BElementOp, CDEElementOp +// +// Example: CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); +// expands to DeviceOpInstanceKKNN, DeviceOpInstanceKNNN, +// DeviceOpInstanceMKNN, DeviceOpInstanceMNNN, +// and sets DeviceOpInstance = DeviceOpInstanceKKNN. +// clang-format off +#define CK_CONTRACTION_DEVICE_OP_INSTANCES(BASE, SUFFIX) \ + using DeviceOpInstanceKK##SUFFIX = DeviceOpInstanceKK_##BASE; \ + using DeviceOpInstanceKN##SUFFIX = DeviceOpInstanceKN_##BASE; \ + using DeviceOpInstanceMK##SUFFIX = DeviceOpInstanceMK_##BASE; \ + using DeviceOpInstanceMN##SUFFIX = DeviceOpInstanceMN_##BASE; \ + using DeviceOpInstance = DeviceOpInstanceKK##SUFFIX +// clang-format on diff --git a/example/26_contraction/contraction_bilinear_xdl_bf16.cpp b/example/26_contraction/contraction_bilinear_xdl_bf16.cpp index 8899b54fbf..b5758ed428 100644 --- a/example/26_contraction/contraction_bilinear_xdl_bf16.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_bf16.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
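+// Rough sketch of what this expands to at this call site, mirroring the aliases it replaces above (illustrative only; the macro in common_instances.hpp is the authoritative definition): +//   using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; +//   using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; +//   using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; +//   using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; +//   using DeviceOpInstance = DeviceOpInstanceKKNN;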
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_bf16_compute_fp32.cpp b/example/26_contraction/contraction_bilinear_xdl_bf16_compute_fp32.cpp index 2dac449e99..be03613bd1 100644 --- a/example/26_contraction/contraction_bilinear_xdl_bf16_compute_fp32.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_bf16_compute_fp32.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. +CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp16.cpp b/example/26_contraction/contraction_bilinear_xdl_fp16.cpp index 16e33e0886..5d6d401836 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp16.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp16.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. +CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp b/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp index 494670bcca..ded63dec25 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp index e960199fc3..8779e1fab9 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. +CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp index 2963152eb1..467672986e 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. +CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp index 01966960cc..dff5a0446a 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp index 1ea9bcedfd..2d697f3e07 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. +CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp b/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp index 9e40e28485..341dad6d5b 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. +CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_bf16.cpp b/example/26_contraction/contraction_scale_xdl_bf16.cpp index 586b022397..003bc0274a 100644 --- a/example/26_contraction/contraction_scale_xdl_bf16.cpp +++ b/example/26_contraction/contraction_scale_xdl_bf16.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
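+// Note: for the scale examples SUFFIX is N (DsDataType is the empty tuple), so the invocation
+// below reproduces the aliases this file previously spelled out by hand:
+//   using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic;   (and likewise KNN, MKN, MNN)
+//   using DeviceOpInstance    = DeviceOpInstanceKKN;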
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp b/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp index 9e4a02967a..bada39204e 100644 --- a/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp +++ b/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. +CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp16.cpp b/example/26_contraction/contraction_scale_xdl_fp16.cpp index 1f29e16223..4f3adef47a 100644 --- a/example/26_contraction/contraction_scale_xdl_fp16.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp16.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. +CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp b/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp index 878011afd1..9be3b616f6 100644 --- a/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp32.cpp b/example/26_contraction/contraction_scale_xdl_fp32.cpp index 5d8aa7b9c5..d7754ef546 100644 --- a/example/26_contraction/contraction_scale_xdl_fp32.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. +CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp b/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp index 57b1052a83..deaf7e7bdc 100644 --- a/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. +CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp b/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp index ae23986bc9..de52096712 100644 --- a/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp64.cpp b/example/26_contraction/contraction_scale_xdl_fp64.cpp index 66f22ce63c..3d5d23968f 100644 --- a/example/26_contraction/contraction_scale_xdl_fp64.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp64.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_FP64; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_FP64; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_FP64; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_FP64; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. +CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp b/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp index 2d72be8157..ee2533ca0a 100644 --- a/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_FP64; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_FP64; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_FP64; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_FP64; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, N); #include "run_contraction_scale_example.inc" diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp index b0b2d29d98..2ceca3c877 100644 --- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp +++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp @@ -238,16 +238,6 @@ int main(int argc, char* argv[]) Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{})); -#if 0 - for(int n = 0; n < N; ++n) - { - for(int k = 0; k < K; ++k) - { - b_element_op(b_k_n(k, n), b0_k_n(k, n), b1_k_n(k, n)); - } - } -#endif - using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm expert_ids(HostTensorDescriptor({sorted_tile_num}, {1})); Tensor sorted_token_ids(HostTensorDescriptor({sorted_size}, {1})); Tensor max_token_id(HostTensorDescriptor({1})); - // max_token_id.mData[0] = valid_size; - // max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13}; - // int eids[] = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 7, 3, 3, 3}; max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8}; // int eids[] = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2} for(int i = 0; i < sorted_tile_num; i++) diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp index 552d3cd7b5..8ae97ef1c2 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp @@ -261,16 +261,6 @@ int main(int argc, char* argv[]) Tensor max_token_id(HostTensorDescriptor({1})); max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8}; - // int eids[] = {0, 1, 3, 3, 3}; - // int eids[] = {0, 1, 2, 3, 4, 5, 6, 7}; //, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2} - // int eids[] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 3, 3, 3}; - // int eids[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - // 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - // 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - // 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, - // 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, - // 7, 7, - // 3, 3, 3}; for(int i = 0; i < sorted_tile_num; i++) { expert_ids.mData[i] = i / ck::math::integer_divide_ceil(valid_tile_num, experts); diff --git a/example/66_complex_contraction_bilinear/common_instances.hpp b/example/66_complex_contraction_bilinear/common_instances.hpp index cb6157b29b..3ae168cb72 100644 --- a/example/66_complex_contraction_bilinear/common_instances.hpp +++ b/example/66_complex_contraction_bilinear/common_instances.hpp @@ -194,3 +194,35 @@ using DeviceOpInstanceMN_FP64 = ck::tensor_operation::device:: //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>; // clang-format on + +// Macro to instantiate all four layout variants of DeviceOpInstance. 
+// +// BASE: Generic (for fp16/bf16/fp32) or FP64 (for fp64 — different tile sizes) +// SUFFIX: NN for bilinear (DsDataType = Tuple), +// N for scale (DsDataType = Tuple<>) +// +// Requires these names to be defined in the calling TU before invocation: +// NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, +// CShuffleDataType, DsDataType, EDataType, ComputeDataType, +// AElementOp, BElementOp, CDEElementOp +// +// Example: CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); +// expands to DeviceOpInstanceKKNN, DeviceOpInstanceKNNN, +// DeviceOpInstanceMKNN, DeviceOpInstanceMNNN, +// and sets DeviceOpInstance = DeviceOpInstanceKKNN. +// clang-format off +#define CK_CONTRACTION_DEVICE_OP_INSTANCES(BASE, SUFFIX) \ + using DeviceOpInstanceKK##SUFFIX = DeviceOpInstanceKK_##BASE; \ + using DeviceOpInstanceKN##SUFFIX = DeviceOpInstanceKN_##BASE; \ + using DeviceOpInstanceMK##SUFFIX = DeviceOpInstanceMK_##BASE; \ + using DeviceOpInstanceMN##SUFFIX = DeviceOpInstanceMN_##BASE; \ + using DeviceOpInstance = DeviceOpInstanceKK##SUFFIX +// clang-format on diff --git a/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp32.cpp b/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp32.cpp index e2cae7a1f8..7533281f1a 100644 --- a/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp32.cpp +++ b/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp32.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. +CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_complex_contraction_bilinear_example.inc" diff --git a/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp64.cpp b/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp64.cpp index a2021b5eaa..a41e1f1785 100644 --- a/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp64.cpp +++ b/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp64.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
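+// Note: BASE=FP64 selects the fp64-tuned instances from common_instances.hpp
+// (DeviceOpInstanceKK_FP64 and friends), which use different tile parameters than the
+// Generic fp16/bf16/fp32 configuration.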
+CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, NN); #include "run_complex_contraction_bilinear_example.inc" diff --git a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py index e9ae11fb5f..79fe6492a6 100644 --- a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py +++ b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py @@ -139,6 +139,7 @@ LAYOUT_MAP = {"row": "true", "col": "false"} PIPELINE_MAP = { "qr": "ck_tile::BlockFmhaPipelineQRKSVS", + "qr_hpad": "ck_tile::BlockFmhaPipelineQRKSVSHpad", "qr_async": "ck_tile::BlockFmhaPipelineQRKSVSAsync", "qs": "ck_tile::BlockFmhaPipelineQSKSVS", "qr_async_trload": "ck_tile::BlockFmhaPipelineQRKSVSAsyncTrload", @@ -147,6 +148,7 @@ PIPELINE_MAP = { PIPELINE_ENUM_MAP = { "qr": "ck_tile::BlockFmhaPipelineEnum::QRKSVS", + "qr_hpad": "ck_tile::BlockFmhaPipelineEnum::QRKSVS_HPAD", "qr_async": "ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC", "qr_nwarp_sshuffle": "ck_tile::BlockFmhaPipelineEnum::QRKSVS", "qs": "ck_tile::BlockFmhaPipelineEnum::QSKSVS", diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py index f172bb6ab6..35e8c1be49 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py @@ -635,6 +635,7 @@ class KernelComponentFactory: elif dtype in ["fp8bf16"]: return { 128 : [FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 32, 32, 32, 32, -1)], + 256 : [FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 32, 32, 32, 32, -1)], } # fmt: skip else: return None diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index 6739abf621..7105f1aa5c 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -533,6 +533,7 @@ using fmha_bwd_dot_do_o_pipeline_problem_{F_idx} = ck_tile::BlockFmhaBwdOGradDot typename FmhaBwdTypeConfig::ODataType, typename FmhaBwdTypeConfig::OGradDataType, typename FmhaBwdTypeConfig::DDataType, + typename FmhaBwdTypeConfig::LSEDataType, /* BlockSize = M0 = */ {F_bm0}, {F_hdim}, {F_mode}, diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index 1849068161..c64a19104e 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -60,6 +60,22 @@ FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT #include "fmha_fwd.hpp" """ +FMHA_FWD_KERNEL_HEADER_QR_HPAD = """// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. 
All rights reserved.\n +// auto generated by generate.py +#if defined(__HIP_DEVICE_COMPILE__) && \ + (defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \ + defined(__gfx1103__) || defined(__gfx1150__) || defined(__gfx1151__) || \ + defined(__gfx1152__) || defined(__gfx1153__) || defined(__gfx11_generic__) || \ + defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)) +#if !defined(CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK) +#define CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#endif +#endif +#include "ck_tile/ops/fmha/block/variants.hpp" +#include "fmha_fwd.hpp" +""" + FMHA_FWD_KERNEL_BODY_TEMPLATE = """ #include @@ -206,22 +222,14 @@ float {F_func_name}([[maybe_unused]] fmha_fwd_traits t, [[maybe_unused]] fmha_fw """ FMHA_FWD_API_FOOTER_TEMPLATE = """ float fmha_fwd(fmha_fwd_traits traits, fmha_fwd_args args, const ck_tile::stream_config& config) {{ - const std::string device_name = ck_tile::get_device_name(); - - const bool is_swa = (traits.mask_type != mask_enum::no_mask) and - ((0 < args.window_size_left) or (0 < args.window_size_right)); - const bool can_dispatch_v3 = - (device_name.compare(0, 6, "gfx950") == 0) and - (traits.data_type.compare("fp16") == 0 or traits.data_type.compare("bf16") == 0) and - traits.is_v_rowmajor and (traits.bias_type == bias_enum::no_bias) and - (not traits.has_lse) and (not traits.has_dropout) and - (traits.qscale_type == quant_scale_enum::no_scale) and (not is_swa) and - (args.nhead_q % args.nhead_k == 0) and (args.hdim_q == 128) and (args.hdim_v == 128); - if ({F_is_v3_enabled} and can_dispatch_v3) {{ - return fmha_fwd_v3(traits, args, config); - }} else {{ - return fmha_fwd_v2(traits, args, config); +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunreachable-code" + if ({F_is_v3_enabled}) {{ + float r = fmha_fwd_v3(traits, args, config); + if (r >= 0) return r; }} +#pragma clang diagnostic pop + return fmha_fwd_v2(traits, args, config); }} """ @@ -308,7 +316,7 @@ class FmhaFwdApiTrait: return "true" # always support else: return "true" - elif self.pipeline_tag in ["qr", "qs"]: + elif self.pipeline_tag in ["qr", "qr_hpad", "qs"]: if self.spad == "t": return f"true /*a.seqlen_q % {self.bm0} != 0*/" # TODO: order of get_pipelines() matters! (ugly) else: @@ -331,7 +339,7 @@ class FmhaFwdApiTrait: return f"(a.cu_seqlen_k_ptr != nullptr) || (a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0)" else: return f"(a.cu_seqlen_k_ptr == nullptr) && (a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0)" - elif self.pipeline_tag in ["qr", "qs"]: + elif self.pipeline_tag in ["qr", "qr_hpad", "qs"]: if self.skpad == "t": return f"true /*a.seqlen_k % {self.bn0} != 0*/" # TODO: order of get_pipelines() matters! 
(ugly) else: @@ -352,6 +360,11 @@ class FmhaFwdApiTrait: return f"a.hdim_q % {vec} == 0" else: assert False + elif self.pipeline_tag == "qr_hpad": + if self.dpad == "t": + return "a.hdim_q % 8 == 0" + else: + assert False elif self.pipeline_tag in ["qr", "qs", "qr_async_trload", "qr_async_trload_v3"]: bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max] if self.dpad == "t": @@ -369,6 +382,11 @@ class FmhaFwdApiTrait: return f"a.hdim_v % {vec} == 0" else: assert False + elif self.pipeline_tag == "qr_hpad": + if self.dvpad == "t": + return "a.hdim_v % 8 == 0" + else: + assert False elif self.pipeline_tag in ["qr", "qs", "qr_async_trload", "qr_async_trload_v3"]: bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max] if self.dvpad == "t": @@ -642,6 +660,7 @@ class FmhaFwdKernel: F_pipeline: FmhaFwdPipeline _KERNEL_HEADER: ClassVar[str] = FMHA_FWD_KERNEL_HEADER + _KERNEL_HEADER_QR_HPAD: ClassVar[str] = FMHA_FWD_KERNEL_HEADER_QR_HPAD _KERNEL_BODY_TEMPLATE: ClassVar[str] = FMHA_FWD_KERNEL_BODY_TEMPLATE @classmethod @@ -651,6 +670,12 @@ class FmhaFwdKernel: else: return "ck_tile::FmhaFwdKernel" + @classmethod + def _get_kernel_header(cls, pipeline_tag): + if pipeline_tag == "qr_hpad": + return cls._KERNEL_HEADER_QR_HPAD + return cls._KERNEL_HEADER + @classmethod def _get_cpp_kargs_creator_func_name(cls, pipeline_tag): if pipeline_tag == "qr_async_trload_v3": @@ -659,7 +684,9 @@ class FmhaFwdKernel: return "fmha_fwd_create_kargs_and_grids" def render(self) -> str: - return type(self)._KERNEL_HEADER + type(self)._KERNEL_BODY_TEMPLATE.format( + return type(self)._get_kernel_header(self.F_pipeline.tag) + type( + self + )._KERNEL_BODY_TEMPLATE.format( F_kname=self.name, F_arch=self.F_arch, F_hdim=self.F_hdim, @@ -1059,10 +1086,11 @@ class KernelComponentFactoryGfx950( def get_hdim_tile_size_dict(cls, dtype: str) -> Optional[dict]: result = KernelComponentFactoryGfx9.get_hdim_tile_size_dict(dtype) if dtype in cls._DT_FP16_BF16: - # add tile for qr_async_trload_v3 - if (128, 128) in result.keys(): - result[(128, 128)].append( - FmhaFwdTileSize(256, 32, 128, 128, 32, 128, 8, 1, 1, 8, 1, 1, 32, 32, 16, 32, 32, 16, -1)) # fmt: skip + # # add tile for qr_async_trload_v3 (bf16/fp16 V3 not ready) + # if (128, 128) in result.keys(): + # result[(128, 128)].append( + # FmhaFwdTileSize(256, 32, 128, 128, 32, 128, 8, 1, 1, 8, 1, 1, 32, 32, 16, 32, 32, 16, -1)) # fmt: skip + pass elif dtype in cls._DT_MXFP8: return { # bm0, bn0, bk0, bn1, bk1, @@ -1075,6 +1103,10 @@ class KernelComponentFactoryGfx950( (128, 128) : [FmhaFwdTileSize(128, 128, 64, 128, 64, 128, 4, 1, 1, 4, 1, 1, 32, 32, 64, 32, 32, 64, -1)], (256, 256) : [FmhaFwdTileSize(128, 128, 128, 256, 128, 256, 4, 1, 1, 4, 1, 1, 16, 16, 128, 16, 16, 128, -1)], } # fmt: skip + elif dtype in cls._DT_FP8BF16: + if (128, 128) in result.keys(): + result[(128, 128)].append( + FmhaFwdTileSize(256, 64, 128, 128, 64, 128, 8, 1, 1, 8, 1, 1, 32, 32, 32, 32, 32, 32, -1)) # fmt: skip return result @classmethod @@ -1105,12 +1137,19 @@ class KernelComponentFactoryGfx950( pipelines.append(FmhaFwdPipeline("qr_async_trload", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "t", sink)) # fmt: skip pipelines.append(FmhaFwdPipeline("qr_async_trload", "row", "f", "f", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "t", sink)) # fmt: skip - # qr_async_trload_v3 only supports hdim=hdim_v=128 for now - if (hdim, hdim_v) == (128, 128): - # qr_async_trload_v3 only supports (generic) causal mask - for logits, mask in itertools.product(["t", "f"], ["no", "causal"]): - 
pipelines.append(FmhaFwdPipeline("qr_async_trload_v3", "row", "t", "t", "f", "f", - F_logits=logits, F_bias="no", F_lse="f", F_dropout="f", F_qscale=qscale, F_mask=mask, F_skip="f", F_trload="t", F_sink="f")) # fmt: skip + # # qr_async_trload_v3 bf16/fp16 not ready + # if (hdim, hdim_v) == (128, 128): + # for logits, mask in itertools.product(["t", "f"], ["no", "causal"]): + # pipelines.append(FmhaFwdPipeline("qr_async_trload_v3", "row", "t", "t", "f", "f", + # F_logits=logits, F_bias="no", F_lse="f", F_dropout="f", F_qscale=qscale, F_mask=mask, F_skip="f", F_trload="t", F_sink="f")) # fmt: skip + elif dtype in cls._DT_FP8BF16: + # qr_async_trload_v3 only supports (generic) causal mask + for logits, qscale, mask in itertools.product( + ["t", "f"], + ["no", "pertensor"], + ["no", "causal"], + ): + pipelines.append(FmhaFwdPipeline("qr_async_trload_v3", "row", "t", "t", "f", "f", F_logits=logits, F_bias="no", F_lse="f", F_dropout="f", F_qscale=qscale, F_mask=mask, F_skip="f", F_trload="t", F_sink="f")) # fmt: skip elif dtype in cls._DT_MXFP8 or dtype in cls._DT_MXFP4: # no need dropout kernels @@ -1140,6 +1179,37 @@ class KernelComponentFactoryGfx11(CompatibilityRuleFactory): def supported_dtypes(cls) -> Tuple[str]: return cls._DT_FP16_BF16 + @classmethod + def get_rules(cls) -> List[CompatibilityRule]: + rules = super().get_rules() + + # For gfx11 fp16/bf16 d128, use dpad=dvpad=t for the 64x32 tile: + # the exact-hdim variant (dpad=dvpad=f) is much slower here. + def check_d128_tile_pipeline( + problem_ctx: ProblemContext, kernel_ctx: KernelContext + ) -> bool: + if problem_ctx.dtype not in cls._DT_FP16_BF16: + return True + + if (problem_ctx.hdim, problem_ctx.hdim_v) != (128, 128): + return True + + is_64x32_tile = kernel_ctx.tile.F_bm0 == 64 and kernel_ctx.tile.F_bn0 == 32 + pads_hdim = ( + kernel_ctx.pipeline.F_dpad == "t" and kernel_ctx.pipeline.F_dvpad == "t" + ) + exact_hdim = ( + kernel_ctx.pipeline.F_dpad == "f" and kernel_ctx.pipeline.F_dvpad == "f" + ) + + if is_64x32_tile: + return pads_hdim + + return exact_hdim + + rules.append(check_d128_tile_pipeline) + return rules + @classmethod def get_hdim_tile_size_dict(cls, dtype: str) -> Optional[dict]: if dtype in cls._DT_FP16_BF16: @@ -1148,7 +1218,8 @@ class KernelComponentFactoryGfx11(CompatibilityRuleFactory): ( 32, 32) : [FmhaFwdTileSize( 64, 64, 16, 32, 32, 32, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1)], ( 64, 64) : [FmhaFwdTileSize( 64, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1, CppConstraint("a.max_seqlen_q < 4096")), FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 8, 1, 1, 8, 1, 1, 16, 16, 16, 16, 16, 16, -1)], - (128, 128) : [FmhaFwdTileSize( 64, 64, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1, CppConstraint("a.max_seqlen_q < 4096")), + (128, 128) : [FmhaFwdTileSize( 64, 32, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, 6, CppConstraint("a.hdim_q != 128 || a.hdim_v != 128")), + FmhaFwdTileSize( 64, 64, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1, CppConstraint("a.max_seqlen_q < 4096")), FmhaFwdTileSize(128, 64, 32, 128, 32, 128, 8, 1, 1, 8, 1, 1, 16, 16, 16, 16, 16, 16, 6)], (192, 128) : [FmhaFwdTileSize( 64, 64, 32, 128, 32, 256, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1)], (256, 256) : [FmhaFwdTileSize(128, 64, 32, 256, 32, 256, 8, 1, 1, 8, 1, 1, 16, 16, 16, 16, 16, 16, 6)] @@ -1175,7 +1246,9 @@ class KernelComponentFactoryGfx11(CompatibilityRuleFactory): # Keep only ttff/tttt for gfx11: ffff path is often similar or worse # 
pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip - pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip + pipelines.append(FmhaFwdPipeline("qr_hpad", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip + if receipt == 1: + pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip return pipelines @@ -1209,7 +1282,8 @@ class KernelComponentFactoryGfx12(CompatibilityRuleFactory): # bm0, bn0, bk0, bn1, bk1, ( 32, 32) : [FmhaFwdTileSize( 64, 64, 16, 32, 32, 32, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1)], ( 64, 64) : [FmhaFwdTileSize( 64, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1)], - (128, 128) : [FmhaFwdTileSize( 64, 64, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1)], + (128, 128) : [FmhaFwdTileSize( 64, 64, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1, CppConstraint("a.max_seqlen_q <= 8192")), + FmhaFwdTileSize(128, 64, 32, 128, 32, 128, 8, 1, 1, 8, 1, 1, 16, 16, 16, 16, 16, 16, 6)], (192, 128) : [FmhaFwdTileSize( 64, 64, 32, 128, 32, 256, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1)], (256, 256) : [FmhaFwdTileSize( 64, 64, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1)], } # fmt: skip @@ -1244,9 +1318,11 @@ class KernelComponentFactoryGfx12(CompatibilityRuleFactory): ["t", "f"], ["t", "f"], ): - pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip + # pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip - pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip + pipelines.append(FmhaFwdPipeline("qr_hpad", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip + if receipt == 1: + pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip elif dtype in cls._DT_FP8_FP8BF16 or dtype in cls._DT_FP8FP32: # no need lse/dropout kernels for logits, qscale, mask, bias in itertools.product( @@ -1303,7 +1379,23 @@ class Product: def get_product(receipt: int) -> Product: # Flash attention integration - if receipt in (2, 3): + if receipt == 2: + + def fit(problem_ctx: ProblemContext, kernel_ctx: KernelContext) -> bool: + cond = problem_ctx.dtype in ["fp16", "bf16"] + cond &= kernel_ctx.pipeline.F_vlayout == "row" + cond &= kernel_ctx.pipeline.F_bias in ["no", "alibi"] + cond &= kernel_ctx.pipeline.F_qscale == "no" + cond &= kernel_ctx.pipeline.F_skip == "f" + cond &= kernel_ctx.pipeline.F_sink == "f" + # FlashAttention direct fwd wrappers always use softcap disabled and LSE enabled. 
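+            # (Clarifying note: F_logits == "f" excludes the logit soft-cap variants, and
+            #  F_lse == "t" keeps the LSE output that FlashAttention's backward pass reads back.)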
+ cond &= kernel_ctx.pipeline.F_logits == "f" + cond &= kernel_ctx.pipeline.F_lse == "t" + return cond + + return Product(name="Flash attention integration", rule=fit) + # Receipt 3 forward coverage used by CK library / smoke tests + elif receipt == 3: def fit(problem_ctx: ProblemContext, kernel_ctx: KernelContext) -> bool: cond = problem_ctx.dtype in ["fp16", "bf16"] @@ -1477,8 +1569,8 @@ def write_fwd_api( FMHA_FWD_API_FOOTER_TEMPLATE.format( F_is_v3_enabled=BOOL_MAP[ # NOTE: enable v3 pipelines when ready - # 0 < api_pool.get_num_traits(filter_fn=accept_only_v3) - False + 0 < api_pool.get_num_traits(filter_fn=accept_only_v3) + # False ] ), ] diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index e0ccde8a6b..c9bac50da1 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -939,6 +939,8 @@ def get_fwd_splitkv_blobs( cond = dtype in ["fp16", "bf16"] cond &= pipeline.F_vlayout == "row" cond &= pipeline.F_bias in ["no", "alibi"] + # FlashAttention splitkv paths use softcap-disabled kernels only. + cond &= pipeline.F_logits == "f" cond &= pipeline.F_squant == "f" cond &= pipeline.F_sink == "f" if not cond: @@ -1142,4 +1144,7 @@ def list_blobs( ) for kernel in kernels: f.write((file_path.parent / GEN_DIR / kernel.filename).as_posix() + "\n") - f.write((file_path.parent / GEN_DIR / FMHA_FWD_SPLITKV_API_FILENAME).as_posix() + "\n") + f.write( + (file_path.parent / GEN_DIR / FMHA_FWD_SPLITKV_API_FILENAME).as_posix() + + "\n" + ) diff --git a/example/ck_tile/01_fmha/example_fmha_bwd.cpp b/example/ck_tile/01_fmha/example_fmha_bwd.cpp index c1f3a4fce3..bec7da0a2f 100644 --- a/example/ck_tile/01_fmha/example_fmha_bwd.cpp +++ b/example/ck_tile/01_fmha/example_fmha_bwd.cpp @@ -87,6 +87,7 @@ auto create_args(int argc, char* argv[]) "0", "if set to 1 will use multi-buffer reduction strategy for dq, atomic operation " "will not be used") + .insert("sink_grad", "0", "if set to 1, compute and validate sink token gradient") .insert("json", "0", "0: No Json, 1: Dump Results in Json format") .insert("jsonfile", "fmha_bwd.json", "json file name to dump results"); @@ -122,6 +123,7 @@ auto run(const ck_tile::ArgParser& arg_parser) bool deterministic = arg_parser.get_bool("deterministic"); std::string init_method = arg_parser.get_str("init"); uint32_t seed = arg_parser.get_uint32("seed"); + bool sink_grad = arg_parser.get_bool("sink_grad"); ck_tile::stream_config stream_config{nullptr, true, @@ -154,6 +156,7 @@ auto run(const ck_tile::ArgParser& arg_parser) drop_offset, drop_prefs, mask_str, + sink_grad, deterministic, init_method, seed, diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp index 8eb8834e12..4496a6c9dd 100644 --- a/example/ck_tile/01_fmha/fmha_bwd.hpp +++ b/example/ck_tile/01_fmha/fmha_bwd.hpp @@ -116,6 +116,9 @@ struct fmha_bwd_args void* dv_ptr; void* dbias_ptr; void* dq_acc_ptr; + const void* + sink_ptr; // sink scores [batch, nhead] in log-space (LSEDataType); nullptr disables sink + void* d_sink_ptr; // sink gradient output [nhead] (LSEDataType); nullptr disables sink gradient // Usage notes for sequence length pointer parameters: // @@ -362,11 +365,15 @@ auto fmha_bwd_dot_do_o_create_kargs_and_grids(fmha_bwd_args args) return FmhaBwdOGradDotOKernel::MakeKargs(args.o_ptr, args.do_ptr, args.d_ptr, + args.lse_ptr, + args.sink_ptr, + args.d_sink_ptr, args.p_undrop, args.seqstart_q_ptr, args.seqlen_q_ptr, 
args.cu_seqlen_q_ptr, args.hdim_v, + args.nhead_q, args.stride_do, args.stride_o, args.nhead_stride_do, @@ -378,9 +385,13 @@ auto fmha_bwd_dot_do_o_create_kargs_and_grids(fmha_bwd_args args) return FmhaBwdOGradDotOKernel::MakeKargs(args.o_ptr, args.do_ptr, args.d_ptr, + args.lse_ptr, + args.sink_ptr, + args.d_sink_ptr, args.p_undrop, args.seqlen_q, args.hdim_v, + args.nhead_q, args.stride_do, args.stride_o, args.nhead_stride_do, diff --git a/example/ck_tile/01_fmha/fmha_bwd_runner.hpp b/example/ck_tile/01_fmha/fmha_bwd_runner.hpp index 3123e4f2a8..361bda20eb 100644 --- a/example/ck_tile/01_fmha/fmha_bwd_runner.hpp +++ b/example/ck_tile/01_fmha/fmha_bwd_runner.hpp @@ -77,6 +77,7 @@ bwd_result fmha_bwd_run(mode_enum mode, uint64_t drop_offset, bool drop_prefs, std::string mask_str, + bool sink_grad, // if true, compute and validate sink gradient bool deterministic, std::string init_method, uint32_t seed, @@ -284,6 +285,16 @@ bwd_result fmha_bwd_run(mode_enum mode, get_lengths(o_perm, shape_batch, nhead, shape_seqlen_q, hdim_v)); ck_tile::HostTensor lse_host( std::array{shape_batch, nhead, shape_seqlen_q}); + ck_tile::HostTensor sink_host( + sink_grad ? std::array{shape_batch, nhead} + : std::array{1, 1} /* dummy when sink is disabled */); + if(sink_grad) + { + std::uniform_real_distribution sink_dist(30.0f, 60.0f); + sink_host.ForEach([&](auto& self, auto i) { + self(i) = static_cast(sink_dist(random_engine)); + }); + } ck_tile::HostTensor d_host( std::array{shape_batch, nhead, shape_seqlen_q}); ck_tile::HostTensor randval_host( @@ -301,6 +312,12 @@ bwd_result fmha_bwd_run(mode_enum mode, use_dbias ? get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, max_seqlen_k) : std::array{1, 1, 1, 1} /* dummy shape for simplifying code */); + ck_tile::HostTensor d_sink_host(sink_grad ? std::array{nhead} + : std::array{0}); + if(sink_grad) + { + d_sink_host.ForEach([&](auto& self, auto i) { self(i) = 0; }); + } ck_tile::HostTensor dq_acc_host( std::array{shape_batch, nhead, nsplits, shape_seqlen_q, hdim_q}); @@ -361,11 +378,13 @@ bwd_result fmha_bwd_run(mode_enum mode, ck_tile::DeviceMem bias_buf(bias_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem o_buf(o_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem lse_buf(lse_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem sink_buf(sink_grad ? sink_host.get_element_space_size_in_bytes() : 0); ck_tile::DeviceMem d_buf(d_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem randval_buf(randval_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem dq_buf(dq_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem dk_buf(dk_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem dv_buf(dv_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_sink_buf(sink_grad ? d_sink_host.get_element_space_size_in_bytes() : 0); ck_tile::DeviceMem do_buf(do_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem dbias_buf(dbias_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t)); @@ -396,6 +415,11 @@ bwd_result fmha_bwd_run(mode_enum mode, drop_seed_buf.ToDevice(drop_prefs ? &drop_seed : nullptr); drop_offset_buf.ToDevice(drop_prefs ? 
&drop_offset : nullptr); alibi_slope_buf.ToDevice(alibi_slope_host.data()); + if(sink_grad) + { + sink_buf.ToDevice(sink_host.data()); + d_sink_buf.ToDevice(d_sink_host.data()); + } // clang-format off auto layout_str = [&](bool permute){ @@ -415,7 +439,8 @@ bwd_result fmha_bwd_run(mode_enum mode, << "] b:" << batch << ", h:" << nhead << "/" << nhead_k << ", s:" << seqlen_qs[0] << "/" << seqlen_ks[0] << ", d:" << hdim_q << "/" << hdim_v << ", scale:" << scale << ", bias:" << bias << ", dbias:" << use_dbias << ", p_drop:" << p_drop - << ", s_randval:" << s_randval << ", deterministic:" << deterministic + << (sink_grad ? ", sink:(rand[30,60], grad)" : "") << ", s_randval:" << s_randval + << ", deterministic:" << deterministic << (deterministic ? std::string(", workspace:") + std::to_string(workspace_size_in_megabytes) + "MiB|" + std::to_string(nsplits) + "splits" @@ -479,7 +504,6 @@ bwd_result fmha_bwd_run(mode_enum mode, const void* seqlen_q_ptr_dev = use_qpadding ? seqlen_q_dev.GetDeviceBuffer() : nullptr; const void* seqlen_k_ptr_dev = use_kpadding ? seqlen_k_dev.GetDeviceBuffer() : nullptr; - return fmha_bwd_args{q_buf.GetDeviceBuffer(), k_buf.GetDeviceBuffer(), v_buf.GetDeviceBuffer(), @@ -495,6 +519,8 @@ bwd_result fmha_bwd_run(mode_enum mode, dv_buf.GetDeviceBuffer(), dbias_buf.GetDeviceBuffer(), dq_acc_buf.GetDeviceBuffer(), + sink_buf.GetDeviceBuffer(), + d_sink_buf.GetDeviceBuffer(), seqstart_q.GetDeviceBuffer(), seqstart_k.GetDeviceBuffer(), seqlen_q_ptr_dev, @@ -589,6 +615,7 @@ bwd_result fmha_bwd_run(mode_enum mode, std::vector> randval_host_refs; std::vector> p_hp_host_refs; std::vector> p_lp_host_refs; + std::vector> p_sink_host_refs; randval_buf.FromDevice(randval_host.data()); @@ -765,6 +792,46 @@ bwd_result fmha_bwd_run(mode_enum mode, ck_tile::reference_batched_softmax( s_host_ref, p_hp_host_ref, ck_tile::identity{}, lse_host_ref); + // Incorporate sink token into the softmax distribution (reference computation). + // The sink acts as an extra key whose score is sink_host(wb, i_h) (in log-space), + // which is a per-head random value in [30, 60]. + // lse_new = log(exp(lse_old) + exp(sink)) + // P_new = P_old * exp(lse_old - lse_new) (rescaled token attention) + // P_sink = exp(sink - lse_new) (sink attention weight) + ck_tile::HostTensor p_sink_host_ref( + sink_grad ? std::array{nhead, real_seqlen_q} + : std::array{0, 0}); + if(sink_grad) + { + for(int i_h = 0; i_h < nhead; ++i_h) + { + AccDataType sink_val = sink_host(wb, i_h); + for(int i_q = 0; i_q < real_seqlen_q; ++i_q) + { + // Use numerically stable log-sum-exp: lse_new = log(exp(lse_old)+exp(sink)) + // = max(lse_old, sink) + log(1 + exp(min - max)) + // This handles lse_old = -inf (fully-masked rows) without producing NaN: + // if lse_old=-inf: max=sink, min=-inf, exp(-inf-sink)=0, lse_new=sink + // It also avoids exp(lse_old) overflow when lse_old is large. + // p_scale = exp(lse_old - lse_new) [fraction kept by regular tokens] + // p_sink = exp(sink - lse_new) [sink attention weight] + AccDataType lse_old = lse_host_ref(i_h, i_q); + AccDataType hi = lse_old > sink_val ? lse_old : sink_val; + AccDataType lo = lse_old > sink_val ? 
sink_val : lse_old; + AccDataType lse_new = + hi + ck_tile::log(AccDataType(1) + ck_tile::exp(lo - hi)); + AccDataType p_scale = ck_tile::exp(lse_old - lse_new); + + lse_host_ref(i_h, i_q) = lse_new; + + for(int i_k = 0; i_k < real_seqlen_k; ++i_k) + p_hp_host_ref(i_h, i_q, i_k) *= p_scale; + + p_sink_host_ref(i_h, i_q) = ck_tile::exp(sink_val - lse_new); + } + } + } + if(p_drop > 0) { p_dropped_hp_host_ref = p_hp_host_ref; @@ -823,6 +890,7 @@ bwd_result fmha_bwd_run(mode_enum mode, o_host_refs.push_back(o_host_ref); p_hp_host_refs.push_back(p_hp_host_ref); p_lp_host_refs.push_back(p_lp_host_ref); + p_sink_host_refs.push_back(p_sink_host_ref); if(p_drop > 0) { randval_host_refs.push_back(randval_host_ref); @@ -842,6 +910,8 @@ bwd_result fmha_bwd_run(mode_enum mode, o_buf.ToDevice(o_host.data()); lse_buf.ToDevice(lse_host.data()); dbias_buf.SetZero(); + if(sink_grad) + d_sink_buf.SetZero(); if(launcher.needs_zero_dq_acc) dq_acc_buf.SetZero(); @@ -853,10 +923,19 @@ bwd_result fmha_bwd_run(mode_enum mode, dk_buf.FromDevice(dk_host.data()); dv_buf.FromDevice(dv_host.data()); dbias_buf.FromDevice(dbias_host.data()); + if(sink_grad) + d_sink_buf.FromDevice(d_sink_host.data()); // Track the index into reference vectors (may differ from wb if batches were skipped) ck_tile::index_t ref_idx = 0; + // validation sink accumulator: global over batch, shape [nhead] + ck_tile::HostTensor d_sink_host_ref( + sink_grad ? std::array{nhead} + : std::array{0}); + if(sink_grad) + d_sink_host_ref.ForEach([&](auto& self, auto i) { self(i) = 0; }); + for(ck_tile::index_t wb = 0; wb < batch; ++wb) { // When padding is enabled, use logical lengths instead of computing from padded @@ -932,6 +1011,30 @@ bwd_result fmha_bwd_run(mode_enum mode, ds_hp_host_ref.mDesc.get_lengths()[1], ds_hp_host_ref.mDesc.get_lengths()[2])(std::thread::hardware_concurrency()); + if(sink_grad) + { + // Reference: dSink[h] = -sum_q( P_sink[h,q] * D[h,q] ) + // where D[h,q] = sum_j(dO[h,q,j] * O[h,q,j]) * p_undrop + for(int i_h = 0; i_h < nhead; ++i_h) + { + AccDataType d_sink_head_acc = 0; + for(int i_q = 0; i_q < real_seqlen_q; ++i_q) + { + AccDataType do_dot_o = 0; + for(int o = 0; o < hdim_v; o++) + { + do_dot_o += + ck_tile::type_convert(do_host_ref(i_h, i_q, o)) * + ck_tile::type_convert( + o_host_refs[ref_idx](i_h, i_q, o)) * + p_undrop; + } + d_sink_head_acc += -p_sink_host_refs[ref_idx](i_h, i_q) * do_dot_o; + } + d_sink_host_ref(i_h) += d_sink_head_acc; + } + } + if(use_dbias) { dbias_host_ref = ds_hp_host_ref.template CopyAsType(); @@ -1044,6 +1147,17 @@ bwd_result fmha_bwd_run(mode_enum mode, ref_idx++; } + if(pass && sink_grad) + { + auto [rtol, atol] = get_elimit(hdim_q, hdim_v); + bool dsink_pass = ck_tile::check_err(d_sink_host, + d_sink_host_ref, + std::string("Error: SinkGrad Incorrect results!"), + rtol, + atol); + pass &= dsink_pass; + } + std::cout << ", valid:" << (pass ? 
"y" : "n") << std::flush << std::endl; } diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index 521f1e4738..7d7d01bd05 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -844,6 +844,9 @@ auto fmha_fwd_v3_create_kargs_and_grids(fmha_fwd_args args) return FmhaKernel::MakeKargs(args.q_ptr, args.k_ptr, args.v_ptr, + args.q_descale_ptr, + args.k_descale_ptr, + args.v_descale_ptr, nullptr, // lse_ptr args.o_ptr, args.seqstart_q_ptr, @@ -877,6 +880,9 @@ auto fmha_fwd_v3_create_kargs_and_grids(fmha_fwd_args args) return FmhaKernel::MakeKargs(args.q_ptr, args.k_ptr, args.v_ptr, + args.q_descale_ptr, + args.k_descale_ptr, + args.v_descale_ptr, nullptr, // lse_ptr args.o_ptr, args.seqlen_q, diff --git a/example/ck_tile/01_fmha/fmha_fwd_head_grouping.hpp b/example/ck_tile/01_fmha/fmha_fwd_head_grouping.hpp index 9cd1fb9cdc..9dad951d41 100644 --- a/example/ck_tile/01_fmha/fmha_fwd_head_grouping.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd_head_grouping.hpp @@ -8,13 +8,16 @@ #include #include #include -#include #include #include #include #include #include +#ifdef __linux__ +#include +#endif + #ifndef CK_TILE_FMHA_ENABLE_HEAD_GROUPING #define CK_TILE_FMHA_ENABLE_HEAD_GROUPING 1 #endif @@ -70,6 +73,8 @@ inline std::optional read_property_value(const std::string& filepath, return std::nullopt; } +#if defined(__linux__) + struct kfd_device_location { int domain = 0; @@ -176,6 +181,12 @@ inline size_t get_kfd_sysfs_llc_cache_bytes() return read_kfd_node_l3_bytes(*node); } +#else + +inline size_t get_kfd_sysfs_llc_cache_bytes() { return 0; } + +#endif + inline size_t get_default_llc_cache_bytes_for_arch(const std::string& arch); inline size_t resolve_llc_cache_bytes_uncached(const std::string& arch) diff --git a/example/ck_tile/01_fmha/script/run_full_test.sh b/example/ck_tile/01_fmha/script/run_full_test.sh index 456c3986fa..4fbde37cae 100755 --- a/example/ck_tile/01_fmha/script/run_full_test.sh +++ b/example/ck_tile/01_fmha/script/run_full_test.sh @@ -39,7 +39,6 @@ function print_log_header(){ #run verification tests time example/ck_tile/01_fmha/script/smoke_test_fwd.sh time example/ck_tile/01_fmha/script/smoke_test_bwd.sh -time example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh #run performance benchmarks export fmha_fwd_log="perf_fmha_fwd_$GPU_arch.log" diff --git a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh index 81617ee16c..c246ccb98f 100755 --- a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh +++ b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh @@ -69,6 +69,28 @@ test_h_s_mask -prec=fp16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=0 -operm=0 test_h_s_mask -prec=bf16 -d=$hdim -bias=n -dbias=0 -p_drop=0 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS test_h_s_mask -prec=bf16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS done + +# sink gradient tests: same coverage as main tests but with -sink_grad=1 +for prec in "fp16" "bf16" ; do +for perm in 0 1 ; do +for hdim in 64 128 256 ; do +for mode in 0 1 ; do +for bias in "n" "a" ; do +for p_drop in 0.0 0.2 ; do +test_h_s_mask -prec=$prec -d=$hdim -bias=$bias -dbias=0 -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=0 -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -sink_grad=1 +done +done +done +done +done +done + +# sink gradient additional cases: non-standard hdim +for hdim in 40 48 72 96 ; do +test_h_s_mask 
-prec=fp16 -d=$hdim -bias=n -dbias=0 -p_drop=0 -iperm=0 -operm=0 -deterministic=0 -v=1 -mode=0 -kname=$KNAME $COMMON_ARGS -sink_grad=1 +test_h_s_mask -prec=fp16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS -sink_grad=1 +test_h_s_mask -prec=bf16 -d=$hdim -bias=n -dbias=0 -p_drop=0 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS -sink_grad=1 +done set +x new_fails_count=0 diff --git a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh index 227f26c8f3..1e9942a6e1 100755 --- a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh +++ b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh @@ -235,6 +235,64 @@ run_padding_basic_boundary_tests() { done } +# Sink-specific mask pattern tests (sliding window + sink token). +run_sink_mask_tests() { + # window_size[2,0], sink_size=2 (top-left causal + sink) + # before: after: + # 1 * * * * * * * 1 * * * * * * * + # 1 1 * * * * * * 1 1 * * * * * * + # 1 1 1 * * * * * 1 1 1 * * * * * + # * 1 1 1 * * * * 1 1 1 1 * * * * + # * * 1 1 1 * * * 1 1 1 1 1 * * * + # * * * 1 1 1 * * 1 1 * 1 1 1 * * + # * * * * 1 1 1 * 1 1 * * 1 1 1 * + # * * * * * 1 1 1 1 1 * * * 1 1 1 + run_exe -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=512 -s_k=512 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=t:2,0,2 + run_exe -prec=bf16 -mode=0 -b=2 -h=2 -d=128 -d_v=128 -s=512 -s_k=512 -bias=n -lse=0 -iperm=1 -operm=1 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=t:2,0,2 + + # window_size[0,3], sink_size=2 (top-left + sink) + # before: after: + # 1 1 1 1 * * * * 1 1 1 1 * * * * + # * 1 1 1 1 * * * 1 1 1 1 1 * * * + # * * 1 1 1 1 * * 1 1 1 1 1 1 * * + # * * * 1 1 1 1 * 1 1 * 1 1 1 1 * + # * * * * 1 1 1 1 1 1 * * 1 1 1 1 + run_exe -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=1024 -s_k=1024 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=t:0,3,2 + run_exe -prec=bf16 -mode=1 -b=2 -h=2 -d=128 -d_v=128 -s=1024 -s_k=1024 -bias=n -lse=0 -iperm=1 -operm=1 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=t:0,3,2 + + # window_size[1,0], sink_size=2 (bottom-right + sink) + # before: after: + # * * 1 1 * * * * 1 1 1 1 * * * * + # * * * 1 1 * * * 1 1 * 1 1 * * * + # * * * * 1 1 * * 1 1 * * 1 1 * * + # * * * * * 1 1 * 1 1 * * * 1 1 * + # * * * * * * 1 1 1 1 * * * * 1 1 + run_exe -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=4096 -s_k=4096 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=b:1,0,2 + run_exe -prec=bf16 -mode=0 -b=2 -h=4 -d=128 -d_v=128 -s=2048 -s_k=2048 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=b:1,0,2 + + # window_size[2,0], sink_size=2 (bottom-right, group mode + sink) + run_exe -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=8192 -s_k=8192 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=b:2,0,2 + run_exe -prec=bf16 -mode=1 -b=2 -h=2 -d=128 -d_v=128 -s=4096 -s_k=4096 -bias=n -lse=0 -iperm=1 -operm=1 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=b:2,0,2 + + # window_size[-1,1], 
sink_size=2 (bottom-right, large seqlen + sink) + run_exe -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=16384 -s_k=16384 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=b:-1,1,2 + run_exe -prec=bf16 -mode=1 -b=1 -h=2 -d=128 -d_v=128 -s=8192 -s_k=8192 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=b:-1,1,2 +} + +# init_sink tests: validate sink token initialization across prec/hdim/mode. +run_sink_init_tests() { + for prec in "fp16" "bf16" ; do + for hdim in 64 128 256 ; do + for mode in 0 1 ; do + for mask in 0 1 ; do + run_exe -prec=$prec -mode=$mode -b=1 -h=2 -d=$hdim -d_v=$hdim -s=512 -s_k=512 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=$KNAME $COMMON_ARGS -init_sink=1 -mask=$mask + run_exe -prec=$prec -mode=$mode -b=2 -h=4 -d=$hdim -d_v=$hdim -s=1024 -s_k=1024 -bias=n -lse=0 -iperm=1 -operm=1 -vlayout=r -kname=$KNAME $COMMON_ARGS -init_sink=1 -mask=$mask + done + done + done + done +} + set -x run_fp16_bf16_tests @@ -242,6 +300,8 @@ run_padding_smoke_tests run_padding_basic_boundary_tests run_fp8bf16_tests run_fp8fp32_tests +run_sink_mask_tests +run_sink_init_tests if [ $TEST_APPENDKV -eq 1 ] ; then run_fp16_appendkv_tests diff --git a/example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh b/example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh deleted file mode 100755 index 5c9d3132b3..0000000000 --- a/example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash -# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -# SPDX-License-Identifier: MIT - -# TODO: run this script from CK root or build directory -#EXE="/code/composable_kernel/build/bin/tile_example_fmha_fwd" -set -euo pipefail - -SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) -EXE_NAME=tile_example_fmha_fwd -EXE="$(find . 
-name $EXE_NAME -type f | head -n 1)" -KNAME=1 -GPU_arch=$GPU_arch -if [ -z "$GPU_arch" ] ; then - GPU_arch=$(rocminfo | grep -E 'Name:\s+gfx' | head -n1 | awk '{print $2}') -fi -set -x - -COMMON_ARGS='-v=1 -warmup=0 -repeat=1' - - -$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=512 -s_k=512 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -mask=t:2,0,2 - -# window_size[2,0], sink_size = 2 - -# x=1/y=3 -# 1 * * * * * * * 1 * * * * * * * -# 1 1 * * * * * * 1 1 * * * * * * -# 1 1 1 * * * * * ----> 1 1 1 * * * * * -# * 1 1 1 * * * * 1 1 1 1 * * * * -# * * 1 1 1 * * * 1 1 1 1 1 * * * -# * * * 1 1 1 * * 1 1 * 1 1 1 * * -# * * * * 1 1 1 * 1 1 * * 1 1 1 * -# * * * * * 1 1 1 1 1 * * * 1 1 1 -# l=2/r=0(tl) l=2/r=0/s=2(tl) - -$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=1024 -s_k=1024 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -mask=t:0,3,2 #-mask=b:3,0,2 - -# x=4/y=1 -# 1 1 1 1 * * * * 1 1 1 1 * * * * -# * 1 1 1 1 * * * 1 1 1 1 1 * * * -# * * 1 1 1 1 * * ----> 1 1 1 1 1 1 * * -# * * * 1 1 1 1 * 1 1 * 1 1 1 1 * -# * * * * 1 1 1 1 1 1 * * 1 1 1 1 -# l=0/r=3(tl) l=0/r=3/s=2(tl) -# l=3/r=0(br) l=3/r=0/s=2(br) - - -$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=4096 -s_k=4096 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -mask=b:1,0,2 - -# x=4/y=-1 -# * * 1 1 * * * * 1 1 1 1 * * * * -# * * * 1 1 * * * 1 1 * 1 1 * * * -# * * * * 1 1 * * ----> 1 1 * * 1 1 * * -# * * * * * 1 1 * 1 1 * * * 1 1 * -# * * * * * * 1 1 1 1 * * * * 1 1 -# l=1/r=0(br) l=1/r=0/s=2(br) - - -$EXE -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=8192 -s_k=8192 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -mask=b:2,0,2 - -# x=-1/y=5 - -# * * * * * * * * * * * * -# * * * * * * * * * * * * -# 1 * * * * * 1 * * * * * -# 1 1 * * * * 1 1 * * * * -# 1 1 1 * * * ----> 1 1 1 * * * -# * 1 1 1 * * 1 1 1 1 * * -# * * 1 1 1 * 1 1 1 1 1 * -# * * * 1 1 1 1 1 * 1 1 1 -# l=2/r=0(br) l=2/r=0/s=2(br) - - -$EXE -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=16384 -s_k=16384 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -mask=b:-1,1,2 -# x=-1/y=8 -# * * * * * * * * * * -# * * * * * * * * * * -# 1 * * * * ----> 1 * * * * -# 1 1 * * * 1 1 * * * -# 1 1 1 * * 1 1 1 * * -# 1 1 1 1 * 1 1 1 1 * -# 1 1 1 1 1 1 1 1 1 1 -# 1 1 1 1 1 1 1 1 1 1 -# l=2/r=0(br) l=2/r=0/s=2(br) - -$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=512 -s_k=512 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=1 -v=1 -warmup=0 -repeat=1 -init_sink=1 -mask=1 - -$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=1024 -s_k=1024 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=1 -v=1 -warmup=0 -repeat=1 -init_sink=1 -mask=0 - -$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=4096 -s_k=4096 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -init_sink=1 - -$EXE -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=8192 -s_k=8192 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -init_sink=1 -mask=1 diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index 
diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
index 39dd6357e5..4d13bca2a0 100644
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -284,12 +284,9 @@ int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser,
         b_k_n.SetZero();
     }

-    if(!preshuffle && GemmConfig::UseStructuredSparsity)
+    if constexpr(!preshuffle && GemmConfig::UseStructuredSparsity)
     {
-        if constexpr(GemmConfig::UseStructuredSparsity)
-        {
-            ck_tile::AdjustToStructuredSparsity{}(a_m_k);
-        }
+        ck_tile::AdjustToStructuredSparsity{}(a_m_k);
     }

     ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
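A minimal sketch of why the single collapsed condition above is sufficient, assuming `preshuffle` and `GemmConfig::UseStructuredSparsity` are both compile-time constants; the types below are hypothetical stand-ins for the real CK types, not the library API. When the combined `if constexpr` condition is false inside a template, the branch body is discarded without being instantiated, so the old nested `if constexpr` added nothing.

```cpp
#include <iostream>

// Illustrative stand-ins; names here are hypothetical, not the CK types.
struct DenseConfig  { static constexpr bool UseStructuredSparsity = false; };
struct SparseConfig { static constexpr bool UseStructuredSparsity = true; };

struct Tensor { bool adjusted = false; };

// Stand-in for ck_tile::AdjustToStructuredSparsity.
struct AdjustToStructuredSparsityStub
{
    void operator()(Tensor& t) const { t.adjusted = true; }
};

template <typename GemmConfig, bool preshuffle>
void prepare_a(Tensor& a_m_k)
{
    // One compile-time condition; the discarded branch is never instantiated,
    // so a nested `if constexpr` on UseStructuredSparsity would be redundant.
    if constexpr(!preshuffle && GemmConfig::UseStructuredSparsity)
    {
        AdjustToStructuredSparsityStub{}(a_m_k);
    }
}

int main()
{
    Tensor a, b;
    prepare_a<SparseConfig, /*preshuffle=*/false>(a); // branch taken
    prepare_a<DenseConfig,  /*preshuffle=*/false>(b); // branch discarded
    std::cout << a.adjusted << ' ' << b.adjusted << '\n'; // prints: 1 0
}
```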
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp
index 8f4813a47e..ca49114844 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 // clang-format off
 // rm rn tm tn vn pd x 3p
-#if 0
-template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
-
-template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
-#endif
 template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
 template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp
index e357d7e3ac..f754d8e959 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 // clang-format off
 // rm rn tm tn vn pd x 3p
-#if 0
-template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
-
-template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
-#endif
 template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
 template float add_rmsnorm2d_rdquant_fwd_>(const S&, A);
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp
index 8a5e0c74a0..66f427247a 100644
--- a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 // clang-format off
 // rm rn tm tn vn pd 2p
-#if 0
-template float smoothquant_>(const S&, A);
-template float smoothquant_>(const S&, A);
-template float smoothquant_>(const S&, A);
-template float smoothquant_>(const S&, A);
-
-template float smoothquant_>(const S&, A);
-#endif
 template float smoothquant_>(const S&, A);
 template float smoothquant_>(const S&, A);
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp
index 9c08cf64f0..103f7281b0 100644
--- a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 // clang-format off
 // rm rn tm tn vn pd 2p
-#if 0
-template float smoothquant_>(const S&, A);
-template float smoothquant_>(const S&, A);
-template float smoothquant_>(const S&, A);
-template float smoothquant_>(const S&, A);
-
-template float smoothquant_>(const S&, A);
-#endif
 template float smoothquant_>(const S&, A);
 template float smoothquant_>(const S&, A);
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp
index 8c72b81dc1..56fcca3beb 100644
--- a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 // clang-format off
 // rm rn tm tn vn pd 2p
-#if 0
-template float moe_smoothquant_>(const S&, A);
-template float moe_smoothquant_>(const S&, A);
-template float moe_smoothquant_>(const S&, A);
-template float moe_smoothquant_>(const S&, A);
-
-template float moe_smoothquant_>(const S&, A);
-#endif
 template float moe_smoothquant_>(const S&, A);
 template float moe_smoothquant_>(const S&, A);
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
index 6d7a5e7c1f..2462cd218e 100644
--- a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 // clang-format off
 // rm rn tm tn vn pd 2p
-#if 0
-template float moe_smoothquant_>(const S&, A);
-template float moe_smoothquant_>(const S&, A);
-template float moe_smoothquant_>(const S&, A);
-template float moe_smoothquant_>(const S&, A);
-
-template float moe_smoothquant_>(const S&, A);
-#endif
 template float moe_smoothquant_>(const S&, A);
 template float moe_smoothquant_>(const S&, A);
diff --git a/example/ck_tile/20_grouped_convolution/CMakeLists.txt b/example/ck_tile/20_grouped_convolution/CMakeLists.txt
index 090aae482b..18e71c255d 100644
--- a/example/ck_tile/20_grouped_convolution/CMakeLists.txt
+++ b/example/ck_tile/20_grouped_convolution/CMakeLists.txt
@@ -17,6 +17,12 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
     add_executable(tile_example_grouped_conv_bwd_weight grouped_convolution_backward_weight.cpp)
     target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})

+    # StreamK requires cross-CU coherence (StreamKCoherency), CDNA only.
+    if(GPU_TARGETS MATCHES "gfx90a|gfx942|gfx950")
+        add_executable(tile_example_grouped_conv_bwd_weight_streamk grouped_convolution_backward_weight_streamk.cpp)
+        target_compile_options(tile_example_grouped_conv_bwd_weight_streamk PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
+    endif()
+
     add_executable(tile_example_grouped_conv_bwd_weight_two_stage grouped_convolution_backward_weight_two_stage.cpp)
     target_compile_options(tile_example_grouped_conv_bwd_weight_two_stage PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
index 8287d1171c..6abc002207 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
@@ -17,7 +17,7 @@ template