diff --git a/.gitignore b/.gitignore index 7a70c76072..04ac34466f 100644 --- a/.gitignore +++ b/.gitignore @@ -114,3 +114,28 @@ experimental/grouped_convolution_tile_instances/instances/* !experimental/grouped_convolution_tile_instances/instances/*.inc !experimental/grouped_convolution_tile_instances/instances/*.hpp experimental/grouped_convolution_tile_instances/*.inc +# Heuristics: benchmark data (never in git) +dispatcher/heuristics/data/ + +# Heuristics: experimental/training artifacts (exclude from git) +dispatcher/heuristics/models/**/oof_predictions.parquet +dispatcher/heuristics/models/**/cv_metrics_*.json +dispatcher/heuristics/models/**/eval_report.json +dispatcher/heuristics/models/**/feature_importances_*.json +dispatcher/heuristics/models/**/model_tflops_ihem.lgbm +dispatcher/heuristics/models/**/model_tflops_log.lgbm +dispatcher/heuristics/models/**/model_tflops_log_big.lgbm + +# Heuristics: keep in git (production model files): +# models/{op}_{dtype}_{arch}/model_tflops.lgbm +# models/{op}_{dtype}_{arch}/model_latency.lgbm +# models/{op}_{dtype}_{arch}/model_bandwidth.lgbm +# models/{op}_{dtype}_{arch}/feature_spec.json +# models/{op}_{dtype}_{arch}/train_manifest.json + +# Heuristics: logs and caches +dispatcher/heuristics/*.log +dispatcher/heuristics/__pycache__/ +dispatcher/heuristics/tests/__pycache__/ +dispatcher/heuristics/.pytest_cache/ + diff --git a/Dockerfile b/Dockerfile index f19bc69362..de129d0703 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,22 +2,33 @@ FROM ubuntu:24.04 ARG DEBIAN_FRONTEND=noninteractive ARG ROCMVERSION=7.1.1 +ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ +ARG TARBALL_URL=https://rocm.nightlies.amd.com/tarball/therock-dist-linux-gfx90X-dcgpu-7.12.0a20260218.tar.gz ARG compiler_version="" ARG compiler_commit="" -ARG CK_SCCACHE="" -ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn ENV DEBIAN_FRONTEND=noninteractive +ENV 
PATH=$PATH:/opt/rocm/bin +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib +ENV HIP_PLATFORM=amd # Add rocm repository RUN set -xe && \ apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl -RUN wget https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/noble/amdgpu-install_7.1.1.70101-1_all.deb && \ - apt install ./amdgpu-install_7.1.1.70101-1_all.deb -y && \ - apt update && \ - apt install python3-setuptools python3-wheel -y && \ - apt install rocm-dev -y +RUN if [ "$compiler_version" = "therock" ]; then \ + rm -rf /opt/rocm && mkdir /opt/rocm && \ + echo "Downloading ROCm tarball from $TARBALL_URL..." && \ + wget -q -O /tmp/rocm.tar.gz "$TARBALL_URL" && \ + echo "Extracting tarball to /opt/rocm..." && \ + tar -xzf /tmp/rocm.tar.gz -C /opt/rocm --strip-components=1 ; \ + else echo "using the release compiler" && \ + wget https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/noble/amdgpu-install_7.1.1.70101-1_all.deb && \ + apt install ./amdgpu-install_7.1.1.70101-1_all.deb -y && \ + apt update && \ + apt install python3-setuptools python3-wheel -y && \ + apt install rocm-dev -y; \ + fi # Install SCCACHE ENV SCCACHE_VERSION="0.14.0" @@ -34,7 +45,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- build-essential \ cmake \ git \ - hip-rocclr \ iputils-ping \ jq \ libelf-dev \ @@ -44,8 +54,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- net-tools \ pkg-config \ python3-full \ + python3-pip \ redis \ - rocm-llvm-dev \ sshpass \ stunnel \ software-properties-common \ @@ -88,26 +98,3 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- git clone -b master https://github.com/ROCm/rocm-cmake.git && \ cd rocm-cmake && mkdir build && cd build && \ cmake .. && cmake --build . && cmake --build . 
--target install - -WORKDIR / -# Add alternative compilers, if necessary -ENV compiler_version=$compiler_version -ENV compiler_commit=$compiler_commit -RUN sh -c "echo compiler version = '$compiler_version'" && \ - sh -c "echo compiler commit = '$compiler_commit'" - -RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \ - git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ - cd llvm-project && mkdir build && cd build && \ - cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ - make -j 8 ; \ - else echo "using the release compiler"; \ - fi - -RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \ - git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ - cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ - cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ - make -j 8 ; \ - else echo "using the release compiler"; \ - fi diff --git a/Dockerfile.compiler b/Dockerfile.compiler index c27e016903..9d1e54106e 100644 --- a/Dockerfile.compiler +++ b/Dockerfile.compiler @@ -9,7 +9,7 @@ ENV compiler_commit=$compiler_commit RUN sh -c "echo compiler version = '$compiler_version'" && \ sh -c "echo compiler commit = '$compiler_commit'" -RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \ +RUN if ( [ "$compiler_version" = "develop" ] || [ 
"$compiler_version" = "amd-staging" ] ) && [ "$compiler_commit" = "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ cd llvm-project && git log -1 && mkdir build && cd build && \ cmake -G Ninja \ @@ -43,7 +43,7 @@ RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-sta else echo "using the release compiler"; \ fi -RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \ +RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-staging" ] ) && [ "$compiler_commit" != "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ cmake -G Ninja \ diff --git a/Dockerfile.manylinux b/Dockerfile.manylinux index 2c0bec2840..bfbe847b1d 100644 --- a/Dockerfile.manylinux +++ b/Dockerfile.manylinux @@ -3,7 +3,6 @@ ARG DEBIAN_FRONTEND=noninteractive ARG ROCMVERSION=7.2 ARG compiler_version="" ARG compiler_commit="" -ARG CK_SCCACHE="" ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn ENV DEBIAN_FRONTEND=noninteractive @@ -19,16 +18,15 @@ RUN wget https://repo.radeon.com/amdgpu-install/7.2/rhel/8.10/amdgpu-install-7.2 dnf install python3-setuptools python3-wheel -y && \ dnf install rocm-dev -y -## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined -ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache +# Install SCCACHE +ENV SCCACHE_VERSION="0.14.0" ENV SCCACHE_INSTALL_LOCATION=/usr/local/.cargo/bin ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION} -ENV CK_SCCACHE=$CK_SCCACHE -RUN if [ "$CK_SCCACHE" != "" ]; then \ - mkdir -p ${SCCACHE_INSTALL_LOCATION} && \ - curl 
${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \ - chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache; \ - fi +RUN set -x && \ + mkdir -p ${SCCACHE_INSTALL_LOCATION} && \ + wget -qO sccache.tar.gz https://github.com/mozilla/sccache/releases/latest/download/sccache-v$SCCACHE_VERSION-x86_64-unknown-linux-musl.tar.gz && \ + tar -xzf sccache.tar.gz --strip-components=1 -C ${SCCACHE_INSTALL_LOCATION} && \ + chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache # Install dependencies RUN dnf update -y && DEBIAN_FRONTEND=noninteractive dnf install -y \ @@ -83,19 +81,71 @@ ENV compiler_commit=$compiler_commit RUN sh -c "echo compiler version = '$compiler_version'" && \ sh -c "echo compiler commit = '$compiler_commit'" -RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \ +RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-staging" ] ) && [ "$compiler_commit" = "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ cd llvm-project && mkdir build && cd build && \ - cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ - make -j 8 ; \ + cmake -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_PROJECTS="clang;lld;clang-tools-extra;flang;mlir" \ + -DLLVM_LIT_ARGS="-vv --show-unsupported --show-xfail -j 32" \ + -DPACKAGE_VENDOR="AMD" \ + -DCMAKE_INSTALL_PREFIX=/home/$USER/rocm/pure_llvm_1.0 \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_BUILD_DOCS=ON \ + -DLLVM_TARGETS_TO_BUILD=all \ + -DLIBOMPTARGET_ENABLE_DEBUG=ON \ + -DOFFLOAD_ENABLE_EMISSARY_APIS=OFF \ + -DCLANG_DEFAULT_LINKER=lld \ + -DCLANG_DEFAULT_PIE_ON_LINUX=0 \ + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;openmp;compiler-rt;libunwind;flang-rt" \ + 
-DLIBCXX_ENABLE_SHARED=OFF \ + -DLIBCXX_ENABLE_STATIC=ON \ + -DLIBCXX_INSTALL_LIBRARY=OFF \ + -DLIBCXX_INSTALL_HEADERS=OFF \ + -DLIBCXXABI_ENABLE_SHARED=OFF \ + -DLIBCXXABI_ENABLE_STATIC=ON \ + -DLIBCXXABI_INSTALL_STATIC_LIBRARY=OFF \ + -DLLVM_ENABLE_ASSERTIONS=1 \ + -DLLVM_ENABLE_Z3_SOLVER=OFF \ + -DLLVM_ENABLE_ZLIB=ON \ + -DLLVM_LINK_LLVM_DYLIB=OFF \ + -DCLANG_LINK_CLANG_DYLIB=OFF \ + ../llvm && \ + ninja -j16 ; \ else echo "using the release compiler"; \ fi -RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \ +RUN if ( [ "$compiler_version" = "develop" ] || [ "$compiler_version" = "amd-staging" ] ) && [ "$compiler_commit" != "" ]; then \ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ - cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ - make -j 8 ; \ + cmake -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_PROJECTS="clang;lld;clang-tools-extra;flang;mlir" \ + -DLLVM_LIT_ARGS="-vv --show-unsupported --show-xfail -j 32" \ + -DPACKAGE_VENDOR="AMD" \ + -DCMAKE_INSTALL_PREFIX=/home/$USER/rocm/pure_llvm_1.0 \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_BUILD_DOCS=ON \ + -DLLVM_TARGETS_TO_BUILD=all \ + -DLIBOMPTARGET_ENABLE_DEBUG=ON \ + -DOFFLOAD_ENABLE_EMISSARY_APIS=OFF \ + -DCLANG_DEFAULT_LINKER=lld \ + -DCLANG_DEFAULT_PIE_ON_LINUX=0 \ + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;openmp;compiler-rt;libunwind;flang-rt" \ + -DLIBCXX_ENABLE_SHARED=OFF \ + -DLIBCXX_ENABLE_STATIC=ON \ + -DLIBCXX_INSTALL_LIBRARY=OFF \ + -DLIBCXX_INSTALL_HEADERS=OFF \ + -DLIBCXXABI_ENABLE_SHARED=OFF \ + -DLIBCXXABI_ENABLE_STATIC=ON \ + -DLIBCXXABI_INSTALL_STATIC_LIBRARY=OFF \ + 
-DLLVM_ENABLE_ASSERTIONS=1 \ + -DLLVM_ENABLE_Z3_SOLVER=OFF \ + -DLLVM_ENABLE_ZLIB=ON \ + -DLLVM_LINK_LLVM_DYLIB=OFF \ + -DCLANG_LINK_CLANG_DYLIB=OFF \ + ../llvm && \ + ninja -j16 ; \ else echo "using the release compiler"; \ fi diff --git a/Jenkinsfile b/Jenkinsfile index 8df0980cb3..3569d8b267 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -421,16 +421,19 @@ def buildDocker(install_prefix){ def image_name = getDockerImageName() def base_image_name = getBaseDockerImageName() echo "Building Docker for ${image_name}" - def dockerArgs = "--build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - if(params.COMPILER_VERSION == "develop" || params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){ + def dockerArgs = "--build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " + if(params.COMPILER_VERSION == "develop" || params.COMPILER_VERSION == "amd-staging" || params.COMPILER_COMMIT != ""){ dockerArgs = dockerArgs + " --no-cache --build-arg BASE_DOCKER='${base_image_name}' -f projects/composablekernel/Dockerfile.compiler . " } + else if(params.COMPILER_VERSION == "therock"){ + dockerArgs = dockerArgs + " --no-cache -f projects/composablekernel/Dockerfile . " + } else if(params.RUN_AITER_TESTS){ image_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_aiter" dockerArgs = dockerArgs + " --no-cache -f projects/composablekernel/Dockerfile.aiter --build-arg AITER_BRANCH='${params.aiter_branch}' --build-arg CK_AITER_BRANCH='${params.ck_aiter_branch}' . 
" } else if(params.RUN_PYTORCH_TESTS){ - image_name = "${env.CK_DOCKERHUB}:ck_pytorch" + image_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_pytorch" dockerArgs = dockerArgs + " --no-cache -f projects/composablekernel/Dockerfile.pytorch --build-arg CK_PYTORCH_BRANCH='${params.ck_pytorch_branch}' . " } else{ @@ -470,10 +473,10 @@ def get_docker_options(){ else{ //only add kfd and dri paths if you actually going to run somthing on GPUs dockerOpts = "--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" } - if (params.COMPILER_VERSION == "develop" || params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){ + if (params.COMPILER_VERSION == "develop" || params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "therock" || params.COMPILER_COMMIT != ""){ // the --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 env variable is required when building code with offload-compress flag with // newer clang22 compilers and running with older hip runtima libraries - dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 " + dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 --env HIP_PLATFORM=amd " } // on some machines the group ids for video and render groups may not be the same as in the docker image! 
def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') @@ -1140,7 +1143,7 @@ def run_pytorch_tests(Map conf=[:]){ show_node_info() checkoutComposableKernel() //use the latest pytorch-nightly image - def image = "${env.CK_DOCKERHUB}:ck_pytorch" + def image = "${env.CK_DOCKERHUB_PRIVATE}:ck_pytorch" def dockerOpts=get_docker_options() + ' --group-add irc ' gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'rocm-libraries') { @@ -1183,16 +1186,19 @@ CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_ 0 22 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_TILE_ENGINE_BASIC_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true 0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX101=false;BUILD_GFX908=false;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true;BUILD_PACKAGES=true 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=develop;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true - 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true + 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=therock;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true 0 15 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true 0 13 * * * % BUILD_INSTANCES_ONLY=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;FORCE_CI=true 0 11 * * * % RUN_FULL_CONV_TILE_TESTS=true;RUN_AITER_TESTS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;FORCE_CI=true 0 9 * * * % 
RUN_PYTORCH_TESTS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;BUILD_GFX101=false;BUILD_GFX103=false;BUILD_GFX11=false;BUILD_GFX12=false;BUILD_GFX90A=false;FORCE_CI=true''' : "" +POLL_SPEC = BRANCH_NAME == "develop" ? 'H H/6 * * *' : '' + pipeline { agent none triggers { parameterizedCron(CRON_SETTINGS) + pollSCM(POLL_SPEC) } options { skipDefaultCheckout() @@ -1214,7 +1220,7 @@ pipeline { string( name: 'COMPILER_VERSION', defaultValue: '', - description: 'Specify which version of compiler to use: release, develop, amd-staging, amd-mainline, or leave blank (default).') + description: 'Specify which version of compiler to use: develop, amd-staging, therock, or leave blank (default).') string( name: 'COMPILER_COMMIT', defaultValue: '', diff --git a/dispatcher/README.md b/dispatcher/README.md index d1ca299d78..1395285d60 100644 --- a/dispatcher/README.md +++ b/dispatcher/README.md @@ -154,6 +154,8 @@ rocminfo | grep -i "gfx" ### Install Python Dependencies +#### Core Dependencies (Required) + NumPy is required for Python examples and kernel generation. We recommend using a virtual environment: **Option 1: Using standard venv** @@ -165,8 +167,8 @@ python3 -m venv .venv source .venv/bin/activate # Linux/macOS # .venv\Scripts\activate # Windows -# Install NumPy -pip install numpy +# Install core dependencies +pip install -r python/requirements.txt ``` **Option 2: Using uv (faster alternative)** @@ -179,17 +181,38 @@ uv venv .venv source .venv/bin/activate # Linux/macOS # .venv\Scripts\activate # Windows -# Install NumPy -uv pip install numpy +# Install core dependencies +uv pip install -r python/requirements.txt ``` **Option 3: System-wide install (not recommended)** ```bash -pip install numpy +pip install -r python/requirements.txt ``` > **Note:** Always activate your virtual environment before running CMake or Python examples. 
+#### ML Heuristics Dependencies (Optional) + +For ML-based kernel selection (examples 09-11), install additional dependencies: + +```bash +# Activate your virtual environment first +source .venv/bin/activate + +# Install ML dependencies (LightGBM, pandas, pyarrow, scikit-learn) +pip install -r requirements-ml.txt +``` + +**Why separate?** ML dependencies are large (especially pyarrow) and not needed for basic dispatcher usage. Install only if you need: +- ML-based kernel selection (`examples/gemm/python/09_ml_heuristic.py`) +- Model training (`heuristics/train.py`) +- Model evaluation (`heuristics/evaluate.py`) +- Automated benchmark analysis + +**Core dependencies:** ~50 MB (NumPy only) +**With ML dependencies:** ~500 MB (includes LightGBM, pandas, pyarrow, scikit-learn) + ### Supported Data Types CK Tile supports a wide range of data types for GEMM operations: @@ -470,6 +493,42 @@ python3 examples/gemm/python/10_advanced_benchmark.py \ --- +## ML-Based Kernel Selection (Optional) + +The dispatcher includes ML heuristics for automated kernel selection using trained LightGBM models. + +**Prerequisites:** Install ML dependencies first: + +```bash +pip install -r requirements-ml.txt # ~500 MB (LightGBM, pandas, pyarrow, scikit-learn) +``` + +**Documentation:** See [heuristics/README.md](heuristics/README.md) for: +- Training and evaluating models +- Feature engineering (72 features) +- Using pre-trained models +- Python API reference + +**Examples:** +```bash +python3 examples/gemm/python/09_ml_heuristic.py # ML-based kernel selection +python3 examples/gemm/python/10_rank_kernels.py # Kernel ranking +``` + +**Model Compression:** Trained models are stored in compressed `.lgbm.gz` format to save space (~67% size reduction). Python tools automatically decompress models on first use. 
For C++ examples, decompress manually: + +```bash +# If you have compressed models +cd heuristics/models/gemm_universal_fp16_gfx950 +gunzip model_tflops.lgbm.gz + +# Then use in C++ example +cd ../../../build +./gemm_09_ml_heuristic --model ../heuristics/models/gemm_universal_fp16_gfx950/model_tflops.lgbm +``` + +--- + ## External Integration ### Using Dispatcher in Your Own Project diff --git a/dispatcher/examples/CMakeLists.txt b/dispatcher/examples/CMakeLists.txt index 0359eb0d8d..bda8eb0372 100644 --- a/dispatcher/examples/CMakeLists.txt +++ b/dispatcher/examples/CMakeLists.txt @@ -346,6 +346,55 @@ add_declarative_gpu_example(gemm_04_heuristics gemm/cpp/04_heuristics. add_declarative_gpu_example(gemm_05_json_export gemm/cpp/05_json_export.cpp) add_declarative_gpu_example(gemm_06_multi_registry gemm/cpp/06_multi_registry.cpp) +# ML Heuristic example -- requires LightGBM shared library +# Derive site-packages from active Python interpreter (respects virtualenvs) +find_package(Python3 COMPONENTS Interpreter) + +set(LIGHTGBM_SEARCH_PATHS) +if(Python3_FOUND AND Python3_EXECUTABLE) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import sysconfig; print(sysconfig.get_path('purelib'))" + OUTPUT_VARIABLE PYTHON_SITE_PACKAGES + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + if(PYTHON_SITE_PACKAGES) + list(APPEND LIGHTGBM_SEARCH_PATHS "${PYTHON_SITE_PACKAGES}/lightgbm/lib") + endif() +endif() + +# Fallback to common Python 3.x site-packages if auto-detection failed +if(NOT PYTHON_SITE_PACKAGES) + list(APPEND LIGHTGBM_SEARCH_PATHS + "$ENV{HOME}/.local/lib/python3.12/site-packages/lightgbm/lib" + ) +endif() + +find_library(LIGHTGBM_LIB NAMES LightGBM lib_lightgbm _lightgbm + HINTS ${CMAKE_PREFIX_PATH} + PATHS ${LIGHTGBM_SEARCH_PATHS} + NO_DEFAULT_PATH + DOC "LightGBM shared library for ML heuristics" +) + +# Fallback: search default paths (respects LightGBM_DIR if set by user) +if(NOT LIGHTGBM_LIB) + find_library(LIGHTGBM_LIB NAMES LightGBM lib_lightgbm) 
+endif() + +if(LIGHTGBM_LIB) + add_declarative_gpu_example(gemm_09_ml_heuristic gemm/cpp/09_ml_heuristic.cpp) + target_link_libraries(gemm_09_ml_heuristic PRIVATE ${LIGHTGBM_LIB}) + message(STATUS "LightGBM found: ${LIGHTGBM_LIB} -- building gemm_09_ml_heuristic") +else() + message(STATUS "LightGBM not found -- skipping gemm_09_ml_heuristic") + message(STATUS " To enable ML heuristic example:") + message(STATUS " 1. Activate virtualenv: source .venv/bin/activate") + message(STATUS " 2. Install: pip install -r ../requirements-ml.txt") + message(STATUS " 3. Reconfigure: cmake ..") + message(STATUS " Or set CMAKE_PREFIX_PATH or LightGBM_DIR to LightGBM location") +endif() + # ============================================================================= # GEMM Python Library - Single Fallback Kernel # ============================================================================= diff --git a/dispatcher/examples/gemm/cpp/09_ml_heuristic.cpp b/dispatcher/examples/gemm/cpp/09_ml_heuristic.cpp new file mode 100644 index 0000000000..cec6d1cd02 --- /dev/null +++ b/dispatcher/examples/gemm/cpp/09_ml_heuristic.cpp @@ -0,0 +1,211 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/** + * Example 09: ML-Based Kernel Selection (Native C++) + * + * Uses a trained LightGBM model loaded via the C API to predict TFLOPS + * for each kernel in the registry and select the best one. The kernels + * are JIT-compiled at build time via DECL_KERNEL_SET (same as other examples). + * + * Build: cd dispatcher/build && cmake .. 
&& make gemm_09_ml_heuristic + * Run: ./gemm_09_ml_heuristic --model + */ + +#include +#include +#include +#include +#include + +#include "ck_tile/dispatcher.hpp" +#include "ck_tile/dispatcher/kernel_decl.hpp" +#include "ck_tile/dispatcher/example_args.hpp" +#include "ck_tile/dispatcher/ml_heuristic.hpp" + +using namespace ck_tile::dispatcher; +using namespace ck_tile::dispatcher::utils; +using Signature = decl::Signature; +using Algorithm = decl::Algorithm; + +// Multiple kernel configs for ML to choose from +DECL_KERNEL_SET(ml_kernels, + // Small tiles + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(64, 64, 32) + .wave(2, 2, 1) + .warp(16, 16, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942") + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(64, 64, 64) + .wave(2, 2, 1) + .warp(16, 16, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942") + // Medium tiles + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(128, 128, 32) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942") + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(128, 128, 64) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942") + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(128, 128, 64) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv4") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942") + // Large tiles + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(256, 256, 32) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942") + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(256, 128, 32) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + 
.epilogue("cshuffle"), + "gfx942") + .add(Signature().dtype("fp16").layout("rcr"), + Algorithm() + .tile(128, 256, 32) + .wave(2, 2, 1) + .warp(32, 32, 16) + .pipeline("compv3") + .scheduler("intrawave") + .epilogue("cshuffle"), + "gfx942")); + +int main(int argc, char* argv[]) +{ + ExampleArgs args("Example 09: ML-Based Kernel Selection", + "Uses trained LightGBM model for kernel selection"); + args.add_option("--arch", "gfx942", "GPU architecture"); + args.add_option("--model", "", "Path to LightGBM model file (.lgbm)"); + args.add_option("--log_transform", "false", "Model uses log1p transform"); + + if(!args.parse(argc, argv)) + return 0; + + print_header("Example 09: ML-Based Kernel Selection"); + + std::string gfx_arch = args.get("--arch", "gfx942"); + std::string model_path = args.get("--model", ""); + bool log_transform = (args.get("--log_transform", "false") == "true"); + + if(model_path.empty()) + { + std::cerr << "Error: --model is required" << std::endl; + std::cerr << "Usage: ./gemm_09_ml_heuristic --model path/to/model_tflops.lgbm" << std::endl; + return 1; + } + + // Setup Registry (kernels are JIT compiled from DECL_KERNEL_SET above) + Registry registry; + REGISTER_GENERATED_KERNELS(registry, gfx_arch); + std::cout << "Registry: " << registry.size() << " kernel(s)" << std::endl; + + // Load ML model and create heuristic + HardwareProfile hw; + MLHeuristic ml_heuristic(model_path, ®istry, hw, log_transform); + if(!ml_heuristic.is_loaded()) + { + std::cerr << "Failed to load model. Exiting." 
<< std::endl; + return 1; + } + + // Wire ML heuristic into dispatcher + Dispatcher dispatcher(®istry); + dispatcher.set_strategy(Dispatcher::SelectionStrategy::Heuristic); + dispatcher.set_heuristic([&ml_heuristic](const Problem& p) { return ml_heuristic(p); }); + + std::cout << "Strategy: ML Heuristic (LightGBM)" << std::endl; + + // Test with different problem sizes + using DataType = ck_tile::fp16_t; + std::vector> sizes = { + {128, 128, 64}, + {512, 512, 256}, + {1024, 1024, 512}, + {2048, 2048, 1024}, + }; + + std::cout << std::endl + << std::setw(20) << "Shape" << std::setw(30) << "Selected Kernel" << std::setw(15) + << "Pred TFLOPS" << std::setw(12) << "Select ms" << std::setw(10) << "Status" + << std::endl; + std::cout << std::string(87, '-') << std::endl; + + bool all_passed = true; + + for(const auto& [M, N, K] : sizes) + { + Problem problem; + problem.M = M; + problem.N = N; + problem.K = K; + problem.k_batch = 1; + + auto t0 = std::chrono::high_resolution_clock::now(); + auto kernel = dispatcher.select_kernel(problem); + auto t1 = std::chrono::high_resolution_clock::now(); + double select_ms = std::chrono::duration(t1 - t0).count(); + + std::string size_str = + std::to_string(M) + "x" + std::to_string(N) + "x" + std::to_string(K); + + if(!kernel) + { + std::cout << std::setw(20) << size_str << std::setw(30) << "NONE" << std::setw(15) + << "N/A" << std::setw(12) << std::fixed << std::setprecision(2) << select_ms + << std::setw(10) << "FAIL" << std::endl; + all_passed = false; + continue; + } + + double pred = ml_heuristic.predict_tflops(problem, kernel->get_key()); + std::string name = kernel->get_key().encode_identifier(); + if(name.length() > 27) + name = name.substr(0, 27) + ".."; + + std::cout << std::setw(20) << size_str << std::setw(30) << name << std::setw(15) + << std::fixed << std::setprecision(2) << pred << std::setw(12) + << std::setprecision(2) << select_ms << std::setw(10) << "OK" << std::endl; + } + + std::cout << std::endl + << 
(all_passed ? "*** ALL TESTS PASSED ***" : "*** SOME TESTS FAILED ***") + << std::endl; + + return all_passed ? 0 : 1; +} diff --git a/dispatcher/examples/gemm/python/09_ml_heuristic.py b/dispatcher/examples/gemm/python/09_ml_heuristic.py new file mode 100644 index 0000000000..d6726a2033 --- /dev/null +++ b/dispatcher/examples/gemm/python/09_ml_heuristic.py @@ -0,0 +1,305 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Example 09: ML-Based Kernel Selection + +Uses a trained LightGBM model to select the optimal kernel for each problem +size. The model predicts TFLOPS for every candidate in the kernel pool and +picks the highest-scoring one, which is then JIT-compiled and run. + +This replaces the hand-crafted rules in 08_heuristics.py with a data-driven +approach achieving 97-98% of oracle-best TFLOPS efficiency. + +Complexity: ***** + +Prerequisites: + - Trained model in dispatcher/heuristics/models/gemm_universal_fp8_gfx950/ + - lightgbm, pandas, numpy, pyarrow installed + +Usage: + python3 09_ml_heuristic.py + python3 09_ml_heuristic.py --dtype fp16 --arch gfx942 +""" + +import sys +import argparse +import time +from pathlib import Path +from dataclasses import dataclass +from typing import List + +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python")) +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "heuristics")) + +import numpy as np + +from ctypes_utils import ( + KernelConfig, + setup_gemm_dispatcher, + cleanup_gemm, +) + +from predict import Predictor + + +@dataclass +class KernelSpec: + """Kernel specification -- same structure as 08_heuristics.py""" + + name: str + tile_m: int + tile_n: int + tile_k: int + pipeline: str = "compv3" + scheduler: str = "intrawave" + wave_m: int = 2 + wave_n: int = 2 + wave_k: int = 1 + warp_m: int = 32 + warp_n: int = 32 + warp_k: int = 16 + + +# Kernel pool: representative configs spanning small 
to large tiles, +# compv3/compv4/mem pipelines, and intrawave/interwave schedulers. +KERNEL_POOL = [ + # Small tiles + KernelSpec("s_64x64_k32_v3", 64, 64, 32, "compv3", warp_m=16, warp_n=16), + KernelSpec("s_64x64_k64_v3", 64, 64, 64, "compv3", warp_m=16, warp_n=16), + KernelSpec("s_64x64_k128_v3", 64, 64, 128, "compv3", warp_m=16, warp_n=16), + KernelSpec("s_64x64_k32_v4", 64, 64, 32, "compv4", warp_m=16, warp_n=16), + KernelSpec("s_64x64_k64_mem", 64, 64, 64, "mem", warp_m=16, warp_n=16), + KernelSpec("s_64x64_k128_mem", 64, 64, 128, "mem", warp_m=16, warp_n=16), + # Medium tiles + KernelSpec("m_128x128_k32_v3", 128, 128, 32, "compv3"), + KernelSpec("m_128x128_k64_v3", 128, 128, 64, "compv3"), + KernelSpec("m_128x128_k128_v3", 128, 128, 128, "compv3"), + KernelSpec("m_128x128_k32_v4", 128, 128, 32, "compv4"), + KernelSpec("m_128x128_k64_v4", 128, 128, 64, "compv4"), + KernelSpec("m_128x128_k64_mem", 128, 128, 64, "mem"), + KernelSpec("m_128x128_k128_mem", 128, 128, 128, "mem"), + # Rectangular medium + KernelSpec("r_64x128_k32", 64, 128, 32, "compv3", warp_m=16), + KernelSpec("r_128x64_k32", 128, 64, 32, "compv3", warp_n=16), + KernelSpec("r_64x128_k64", 64, 128, 64, "compv3", warp_m=16), + KernelSpec("r_128x64_k64", 128, 64, 64, "compv3", warp_n=16), + # Large tiles + KernelSpec("l_256x128_k32", 256, 128, 32, "compv3"), + KernelSpec("l_128x256_k32", 128, 256, 32, "compv3"), + KernelSpec("l_256x256_k32", 256, 256, 32, "compv3"), + KernelSpec("l_256x256_k64", 256, 256, 64, "compv3"), + # Interwave variants + KernelSpec("m_128x128_k64_iw", 128, 128, 64, "compv3", "interwave"), + KernelSpec("m_128x128_k64_mem_iw", 128, 128, 64, "mem", "interwave"), +] + + +def spec_to_feature_dict(spec: KernelSpec, dtype: str, layout: str) -> dict: + """Convert a KernelSpec to the dict format the feature engine expects. + + Note: pad_m/n/k default to True to match KernelConfig defaults and actual + compiled kernels. 
This ensures the ML model receives the correct padding
+    flags that will be used during JIT compilation.
+    """
+    return {
+        "kernel_name": spec.name,
+        "tile_m": spec.tile_m,
+        "tile_n": spec.tile_n,
+        "tile_k": spec.tile_k,
+        # Naming crossover: KernelSpec.wave_* feeds the "warp_*" features and
+        # KernelSpec.warp_* feeds the "warp_tile_*" features. This mirrors the
+        # benchmark schema (warp config vs. warp tile dimensions).
+        "warp_m": spec.wave_m,
+        "warp_n": spec.wave_n,
+        "warp_k": spec.wave_k,
+        "warp_tile_m": spec.warp_m,
+        "warp_tile_n": spec.warp_n,
+        "warp_tile_k": spec.warp_k,
+        "pipeline": spec.pipeline,
+        "scheduler": spec.scheduler,
+        "epilogue": "cshuffle",
+        "pad_m": True,  # Match KernelConfig default
+        "pad_n": True,  # Match KernelConfig default
+        "pad_k": True,  # Match KernelConfig default
+        "persistent": False,
+        "dtype": dtype,
+        "layout": layout,
+    }
+
+
+def spec_to_kernel_config(spec: KernelSpec, dtype: str, arch: str) -> KernelConfig:
+    """Convert a KernelSpec to the dispatcher's KernelConfig for JIT compilation."""
+    # dtype is used for A, B, and C; accumulation is always fp32, and the
+    # layouts are fixed to row/col/row (i.e. "rcr").
+    return KernelConfig(
+        dtype_a=dtype,
+        dtype_b=dtype,
+        dtype_c=dtype,
+        dtype_acc="fp32",
+        layout_a="row",
+        layout_b="col",
+        layout_c="row",
+        tile_m=spec.tile_m,
+        tile_n=spec.tile_n,
+        tile_k=spec.tile_k,
+        wave_m=spec.wave_m,
+        wave_n=spec.wave_n,
+        wave_k=spec.wave_k,
+        warp_m=spec.warp_m,
+        warp_n=spec.warp_n,
+        warp_k=spec.warp_k,
+        pipeline=spec.pipeline,
+        scheduler=spec.scheduler,
+        epilogue="cshuffle",
+        gfx_arch=arch,
+    )
+
+
+def ml_select_kernel(
+    predictor: Predictor,
+    pool: List[KernelSpec],
+    M: int,
+    N: int,
+    K: int,
+    dtype: str,
+    layout: str,
+) -> tuple:
+    """Score all kernels in the pool and return (best_spec, predicted_tflops)."""
+    problem = {"m": M, "n": N, "k": K, "dtype": dtype, "layout": layout, "split_k": 1}
+    kernel_dicts = [spec_to_feature_dict(s, dtype, layout) for s in pool]
+
+    # rank_kernels returns (kernel_name, predicted_tflops) pairs, best first.
+    # An empty result falls back to the first pool entry with 0.0 TFLOPS.
+    ranked = predictor.rank_kernels(problem, kernel_dicts)
+    if not ranked:
+        return pool[0], 0.0
+
+    best_name, best_tflops = ranked[0]
+    best_spec = next((s for s in pool if s.name == best_name), pool[0])
+    return best_spec, best_tflops
+
+
+def main():
+    parser = argparse.ArgumentParser(description="ML-based
kernel selection for GEMM") + parser.add_argument("--dtype", default="fp16", choices=["fp16", "bf16", "fp8"]) + parser.add_argument("--arch", default="gfx942") + parser.add_argument( + "--model_dir", + default=str( + Path(__file__).parent.parent.parent.parent + / "heuristics" + / "models" + / "gemm_universal_fp8_gfx950" + ), + ) + parser.add_argument( + "--no_run", action="store_true", help="Only predict, don't run GEMMs" + ) + args = parser.parse_args() + + print("=" * 75) + print(" Example 09: ML-Based Kernel Selection") + print("=" * 75) + print(f"\n Model: {args.model_dir}") + print(f" Dtype: {args.dtype}") + print(f" Arch: {args.arch}") + print(f" Pool: {len(KERNEL_POOL)} kernels") + + predictor = Predictor(args.model_dir) + print(" Model loaded successfully") + + np_dtype = np.float16 if args.dtype in ["fp16", "bf16"] else np.float16 + + test_sizes = [ + (128, 128, 64), + (256, 256, 128), + (512, 512, 256), + (1024, 1024, 512), + (2048, 2048, 1024), + ] + + header = f"{'Shape':<20} {'Selected Kernel':<25} {'Pred TFLOPS':>12}" + if not args.no_run: + header += f" {'Time (ms)':>10} {'TFLOPS':>10} {'Status':<8}" + print(f"\n {header}") + print(" " + "-" * len(header)) + + results = [] + + for M, N, K in test_sizes: + t0 = time.time() + best_spec, pred_tflops = ml_select_kernel( + predictor, KERNEL_POOL, M, N, K, args.dtype, "rcr" + ) + _ = (time.time() - t0) * 1000 # ML selection time (unused) + + size_str = f"{M}x{N}x{K}" + line = f" {size_str:<20} {best_spec.name:<25} {pred_tflops:>12.2f}" + + if args.no_run: + print(line) + results.append((size_str, best_spec.name, True, 0, pred_tflops)) + continue + + config = spec_to_kernel_config(best_spec, args.dtype, args.arch) + + setup = setup_gemm_dispatcher( + config=config, + registry_name=f"ml_{best_spec.name}", + verbose=False, + auto_rebuild=True, + ) + + if not setup.success: + line += f" {'N/A':>10} {'N/A':>10} {'BUILD':>8}" + print(line) + results.append((size_str, best_spec.name, False, 0, 0)) + 
cleanup_gemm()
+            continue
+
+        dispatcher = setup.dispatcher
+        if not dispatcher.is_supported(M, N, K):
+            # Selected kernel cannot handle this shape at runtime.
+            line += f" {'N/A':>10} {'N/A':>10} {'UNSUP':>8}"
+            print(line)
+            results.append((size_str, best_spec.name, False, 0, 0))
+            cleanup_gemm()
+            continue
+
+        # Deterministic inputs; small scale (0.1) keeps values in range.
+        np.random.seed(42)
+        A = (np.random.randn(M, K) * 0.1).astype(np_dtype)
+        B = (np.random.randn(K, N) * 0.1).astype(np_dtype)
+
+        result = dispatcher.run(A, B, M, N, K)
+
+        if result.success:
+            # Verify against an fp32 numpy reference; 1e-2 absolute tolerance.
+            C_ref = np.matmul(A.astype(np.float32), B.astype(np.float32)).astype(
+                np_dtype
+            )
+            max_err = np.max(np.abs(result.output - C_ref))
+            passed = max_err < 1e-2
+            status = "PASS" if passed else "FAIL"
+            line += f" {result.time_ms:>10.4f} {result.tflops:>10.2f} {status:<8}"
+            results.append(
+                (size_str, best_spec.name, passed, result.time_ms, result.tflops)
+            )
+        else:
+            line += f" {'N/A':>10} {'N/A':>10} {'FAIL':<8}"
+            results.append((size_str, best_spec.name, False, 0, 0))
+
+        print(line)
+        cleanup_gemm()
+
+    # Summary
+    # results tuples: (shape_str, kernel_name, passed, time_ms, tflops)
+    print("\n" + "=" * 75)
+    print(" SUMMARY")
+    print("=" * 75)
+    passed = sum(1 for r in results if r[2])
+    print(f"\n Results: {passed}/{len(results)} tests passed")
+    # Average TFLOPS over passing runs that actually executed (tflops > 0).
+    valid = [r for r in results if r[2] and r[4] > 0]
+    if valid:
+        avg = sum(r[4] for r in valid) / len(valid)
+        print(f" Average TFLOPS: {avg:.2f}")
+    if passed == len(results):
+        print("\n *** ALL TESTS PASSED ***")
+    print("=" * 75)
+    return 0 if passed == len(results) else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/dispatcher/heuristics/.gitignore b/dispatcher/heuristics/.gitignore
new file mode 100644
index 0000000000..d9523255bf
--- /dev/null
+++ b/dispatcher/heuristics/.gitignore
@@ -0,0 +1,60 @@
+# Python bytecode and caches
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+
+# Jupyter notebooks
+*.ipynb
+.ipynb_checkpoints/
+
+# Virtual environments
+.venv/
+venv/
+ENV/
+
+# IDE and editor files
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# Test output and logs
+*.log
+test_output.log
+custom_shapes_gpu_test.log
+
+#
Benchmark and analysis output files +*.csv +*.json +!models/*/feature_spec.json +!models/*/train_manifest.json + +# Data files (parquet, arrow) +*.parquet +*.arrow + +# Temporary and NFS files +.nfs* +*.tmp +*.bak + +# Decompressed model files (compressed .lgbm.gz versions are tracked) +models/**/*.lgbm + +# User-specific test and analysis scripts +test_*.py +!tests/test_*.py +find_*.py +oracle_*.json +validation_results_*.csv +custom_shapes_*.csv +fp16_bf16_*.csv + +# Ignore all markdown files except tracked documentation +*.md +!DATA_GENERATION.md +!LEARNINGS.md +!README.md diff --git a/dispatcher/heuristics/DATA_GENERATION.md b/dispatcher/heuristics/DATA_GENERATION.md new file mode 100644 index 0000000000..819e77fe48 --- /dev/null +++ b/dispatcher/heuristics/DATA_GENERATION.md @@ -0,0 +1,412 @@ +# Data Generation Guide + +This document explains how to build benchmark binaries from the CK Tile engine, +generate benchmark datasets, and manage them for the ML kernel performance +prediction system. + +## Overview + +The ML heuristic needs benchmark data: measured TFLOPS, latency, and bandwidth +for every (problem shape, kernel config) pair. The tile engine builds one +executable per kernel configuration. Each executable benchmarks a single kernel +on a given problem size and outputs JSON with performance metrics. + +``` +CK source --> CMake configure --> ninja build --> benchmark binaries + (4608 per op/dtype/layout) + +benchmark binaries --> run on GPU --> streaming log --> parquet dataset + (per shape) (JSON blocks) (canonical schema) +``` + +## Prerequisites + +- **ROCm**: HIP >= 6.0.3 (for gfx950: HIP >= 6.0.4) +- **Build tools**: CMake >= 3.21, Ninja, HIP-aware clang compiler +- **Python**: 3.10+ with `pandas`, `pyarrow` +- **GPU**: ROCm-capable AMD GPU (MI250X, MI300X, MI355X, etc.) + +--- + +## Part 1: Building Benchmark Binaries from the Tile Engine + +If you already have pre-built binaries (e.g., in `/workspace/ck_tile/bin/`), +skip to Part 2. 
This section explains how to build them from source. + +### Step 1: CMake Configure + +From the CK repository root: + +```bash +cmake -S /workspace/rocm-libraries/projects/composablekernel \ + -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DGPU_TARGETS="gfx950" \ + -DGEMM_UNIVERSAL_DATATYPE="fp8" \ + -DGEMM_UNIVERSAL_LAYOUT="rcr" \ + -G Ninja +``` + +**Key CMake variables:** + +| Variable | Default | Description | +|---|---|---| +| `GPU_TARGETS` | (required) | Target GPU architectures. Supported: `gfx90a`, `gfx942`, `gfx950`, `gfx1201`. Semicolon-separated for multiple. | +| `GEMM_UNIVERSAL_DATATYPE` | `"fp8;fp16"` | Data types to build. Options: `fp8`, `fp16`, `bf16`, `bf8`. Semicolon-separated. | +| `GEMM_UNIVERSAL_LAYOUT` | `"rcr;rrr;crr;ccr"` | Layouts to build. Semicolon-separated. | +| `GEMM_UNIVERSAL_CONFIG_FILE` | `"default_config.json"` | Kernel config file (in the `configs/` directory). Controls which tile sizes, warp configs, pipelines, etc. are enumerated. | +| `ENABLE_CCACHE_GEMM_UNIVERSAL` | `OFF` | Enable ccache for faster rebuilds. | + +**Example: build only fp8 RCR for gfx950 (fastest, ~4608 kernels):** +```bash +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release \ + -DGPU_TARGETS="gfx950" \ + -DGEMM_UNIVERSAL_DATATYPE="fp8" \ + -DGEMM_UNIVERSAL_LAYOUT="rcr" \ + -G Ninja +``` + +**Example: build all dtypes and layouts (slow, ~4608 * 4 * 4 = ~73K kernels):** +```bash +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release \ + -DGPU_TARGETS="gfx950" \ + -DGEMM_UNIVERSAL_DATATYPE="fp8;fp16;bf16;bf8" \ + -DGEMM_UNIVERSAL_LAYOUT="rcr;rrr;crr;ccr" \ + -G Ninja +``` + +### What happens during configure + +1. CMake calls `gemm_universal_instance_builder.py --list_kernels` to enumerate + all valid kernel configurations from the config JSON. +2. It writes `gemm_universal_kernel_list.txt` (one kernel per line) and + `gemm_universal_kernel_count.txt` to the build directory. +3. For each kernel, it creates a ninja build target. 
+ +### Step 2: Build + +```bash +# Build all benchmarks for the configured dtypes/layouts +ninja -C build benchmark_gemm_universal_all + +# Or build a specific dtype/layout combo +ninja -C build benchmark_gemm_universal_fp8_rcr + +# Or build by pipeline type +ninja -C build benchmark_gemm_universal_compv4_pipeline +ninja -C build benchmark_gemm_universal_mem_pipeline + +# Or build a single specific kernel +ninja -C build benchmark_gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128 +``` + +**Build time estimates:** +- ~4608 kernels (one dtype, one layout): 1-4 hours depending on CPU cores +- Use `-j ` to control parallelism: `ninja -C build -j 32 benchmark_gemm_universal_fp8_rcr` + +### Step 3: Verify binaries + +Binaries are placed in `build/bin/`: + +```bash +ls build/bin/benchmark_gemm_universal_fp8_rcr_* | wc -l +# Expected: 4608 (for default config) + +# Test one binary +./build/bin/benchmark_gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128 \ + -m=1024 -n=1024 -k=1024 -warmup=3 -repeat=10 -verify=0 +``` + +### Kernel config files + +The config files live in: +``` +tile_engine/ops/gemm/gemm_universal/configs/ + default_config.json # Default: full enumeration + default_ci_config.json # CI: reduced set for fast testing + user_provided_config.json # Custom: your own subset +``` + +To use a custom config: +```bash +cmake ... -DGEMM_UNIVERSAL_CONFIG_FILE="user_provided_config.json" +``` + +The config controls which tile sizes (e.g., 128x128x64, 256x256x32), warp +configurations (e.g., 2x2x1, 1x4x1), pipelines (compv3, compv4, mem), +schedulers, and other parameters are included in the kernel enumeration. 
+ +### Building StreamK / other ops + +The same pattern applies to other tile engine ops: + +```bash +# StreamK +ninja -C build benchmark_gemm_streamk_fp8_rcr + +# Grouped convolution +ninja -C build benchmark_grouped_conv_fwd_fp16_nhwgc +``` + +Each op has its own instance builder and config directory. + +--- + +## Part 2: Running Benchmarks and Generating Data + +## Quick Start + +### 1. Run benchmarks for a set of shapes + +Each binary accepts `-m=`, `-n=`, `-k=`, `-warmup=`, `-repeat=`, `-verify=` flags +and outputs JSON to stdout: + +```bash +/workspace/ck_tile/bin/benchmark_gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128 \ + -m=1024 -n=1024 -k=1024 -warmup=3 -repeat=10 -verify=0 +``` + +Output: +```json +{ + "name": "gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_...", + "problem": { + "split_k": 1, "m": 1024, "n": 1024, "k": 1024, + "dtype_a": "fp8", "dtype_b": "fp8", ... + }, + "perf_result": { + "latency(ms)": 0.04, + "tflops(TFlops)": 204.60, + "bandwidth(GB/s)": 624.39 + } +} +``` + +### 2. Batch generation using provided scripts + +**Wide coverage (diverse shapes across all regimes):** +```bash +python3 generate_wide_coverage.py \ + --bin_dir /workspace/ck_tile/bin \ + --out_dir data/wide_coverage \ + --batch_size 25 \ + --warmup 3 --repeat 10 +``` + +**Edge-case dimensions (N=1, K=1, small N/K):** +```bash +python3 generate_edge_dims.py +``` + +Both scripts write streaming log files that `data_pipeline.py` can parse. + +### 3. Parse logs into parquet + +```bash +python3 data_pipeline.py \ + -o data/my_dataset.parquet \ + --arch gfx950 \ + --capture_hw +``` + +The `--capture_hw` flag runs `rocminfo` once and injects the GPU hardware +profile (CU count, clock speed, cache sizes, etc.) into every row. + +## Canonical Data Schema + +Every parquet file follows this schema: + +| Column | Type | Description | +|---|---|---| +| `op_type` | str | `gemm_universal`, `gemm_streamk`, etc. 
| +| `dtype` | str | `fp8`, `fp16`, `bf16`, `bf8` | +| `layout` | str | `rcr`, `rrr`, `crr`, `ccr` | +| `arch` | str | `gfx942`, `gfx950`, etc. | +| `kernel_name` | str | Full kernel identifier | +| `m`, `n`, `k` | int | Problem dimensions | +| `split_k` | int | Split-K factor (1 = standard) | +| `measured_tflops` | float | Ground-truth TFLOPS | +| `latency_ms` | float | Measured latency | +| `bandwidth_gb_s` | float | Measured bandwidth | +| `is_valid` | bool | True if tflops > 0 and latency > 0 | +| `tile_m`, `tile_n`, `tile_k` | int | Tile dimensions | +| `warp_m`, `warp_n`, `warp_k` | int | Warp config | +| `warp_tile_m/n/k` | int | Warp tile dimensions | +| `pipeline` | str | `compv3`, `compv4`, `mem`, etc. | +| `scheduler` | str | `intrawave`, `interwave` | +| `epilogue` | str | `cshuffle`, `default` | +| `pad_m`, `pad_n`, `pad_k` | bool | Padding flags | +| `persistent` | bool | Persistent kernel flag | +| `run_id` | str | Unique collection run identifier | + +## Shape Selection Guidelines + +Good training data requires diverse shapes. 
Cover all of these regimes: + +### By M dimension (batch size / output rows) +- **M=1**: single-token inference (hardest case for tiling) +- **Tiny M (2-16)**: small batch inference +- **Small M (32-128)**: medium batch +- **Medium M (256-2048)**: large batch / training +- **Large M (4096-20480)**: very large batch + +### By N and K dimension +- **N=1**: vector-matrix multiply (degenerate) +- **K=1**: rank-1 update / outer product (degenerate) +- **Small N or K (2-16)**: stress tile efficiency +- **Deep K (K > 4096)**: compute-bound regime +- **Shallow K (K < 256)**: memory-bound regime + +### By shape family +- **Square**: M ~ N ~ K (powers of 2) +- **Tall**: M >> N (tall output matrix) +- **Wide**: N >> M (wide output matrix) +- **Deep-K**: K >> M and K >> N + +### Special cases +- **Prime dimensions**: 17, 31, 127, 251, 509, 1021, 2039, 4093 + (worst-case for tile alignment, tests padding logic) +- **Non-power-of-2**: 48, 96, 192, 384, 576, 768, 1536, 3072, 4608 + (common in LLM architectures) +- **LLM inference shapes**: DeepSeek, LLaMA-7B, LLaMA-70B MLP/attention dims + +### Minimum recommended coverage + +For a production-quality model, aim for: +- At least 200 unique (M, N, K) shapes +- At least 10 shapes per shape family +- All kernel configs (4608 for fp8 RCR) run against every shape +- Multiple layouts if training a cross-layout model + +## Benchmark Quality Guidelines + +### Warmup and repeat +- Minimum `warmup=3`, `repeat=10` for fast iteration +- Production quality: `warmup=5`, `repeat=20` for stable measurements +- The `perf_result` values are averaged over `repeat` iterations + +### Noise handling +- Use **median** latency when aggregating multiple runs of the same benchmark +- Flag measurements where coefficient of variation exceeds 10% +- Avoid benchmarking under thermal throttling (check GPU temperature) +- Lock GPU clocks if possible for reproducibility + +### Environment metadata +Store with every dataset: +- GPU model and architecture (from 
`rocminfo`) +- ROCm driver version +- Clock mode (default / locked) +- Git hash of the CK tile engine build (if available) +- Timestamp + +## Adding Data for a New Op + +To generate benchmark data for a new operation (e.g., `gemm_streamk`): + +1. **Build the binaries** using the tile engine: + ```bash + ninja -C build benchmark_gemm_streamk_fp8_rcr + ``` + +2. **Write a generation script** (or modify `generate_wide_coverage.py`): + - Change the executable glob pattern to match the new op + - Add any op-specific CLI flags the binaries need + +3. **Run and parse**: + ```bash + python3 data_pipeline.py my_streamk_run.log \ + -o data/gemm_streamk_fp8_gfx950.parquet --arch gfx950 + ``` + +4. **Train**: + ```bash + python3 train.py --op gemm_streamk --dtype fp8 --arch gfx950 \ + --data_dir data/ --out_dir models/gemm_streamk_fp8_gfx950 + ``` + +## Adding Data for a New Layout + +Same binaries, same shapes -- just change the layout filter: + +```bash +# Build rrr binaries +ninja -C build benchmark_gemm_universal_fp8_rrr + +# Generate and parse +# ... (same flow, different bin_dir or executable glob) + +# Train a cross-layout model by putting all layouts in the same data_dir +python3 train.py --data_dir data/ --out_dir models/gemm_universal_fp8_gfx950_all_layouts +``` + +The feature engine includes `layout` as a categorical feature, so one model +can handle all layouts. + +## Incremental Data Collection + +When you have a trained model and want to add more data: + +1. Generate new data (new shapes, new layouts, etc.) +2. Parse into parquet alongside existing data +3. Warm-start from the previous model: + ```bash + python3 train.py --data_dir data/ --out_dir models/v2 \ + --warm_start models/v1 \ + --warm_start_n_estimators 200 + ``` + +This adds 200 new trees on top of the existing model. The feature schema +must match exactly (enforced automatically). 
+ +## File Organization + +Recommended directory structure: + +``` +heuristics/ + data/ + gemm_universal_fp8_rcr_gfx950.parquet # original 108 shapes + wide_coverage/ # batch log files + wide_coverage_batch_001.log + wide_coverage_batch_002.log + ... + edge_dims/ # N=1, K=1 edge cases + edge_dims_batch_001.log + ... + models/ + gemm_universal_fp8_gfx950/ # trained model artifacts + model_tflops.lgbm + model_latency.lgbm + model_bandwidth.lgbm + feature_spec.json + train_manifest.json + cv_metrics_tflops.json + eval_report.json + ... +``` + +## Troubleshooting + +### Benchmark binary exits with non-zero code +Some kernel configs are invalid for certain problem sizes (e.g., tile_m=256 +with M=16). The data pipeline marks these as `is_valid=False` and they are +filtered out during training. This is expected. + +### Edge dims produce very few results +N=1 and K=1 shapes are degenerate -- most kernel configurations have minimum +dimension requirements and will fail or produce zero TFLOPS. The small number +of valid results is still useful (it tells the model which configs work for +these shapes). + +### Benchmarks are slow +Each shape requires running all 4608 kernel executables sequentially. At +~0.01s per kernel, that is ~46 seconds per shape. For 700 shapes, expect +~9 hours. Tips: +- Run on a dedicated GPU (no other workloads) +- Use `--batch_size 25` to get incremental output +- Parse and train on partial data while generation continues + +### Data from different GPUs / driver versions +Store `run_id` and hardware metadata with each dataset. Training on mixed +data is allowed but not recommended for production models. Filter to a +single `run_id` or `arch` for clean experiments. 
diff --git a/dispatcher/heuristics/LEARNINGS.md b/dispatcher/heuristics/LEARNINGS.md new file mode 100644 index 0000000000..dba3514601 --- /dev/null +++ b/dispatcher/heuristics/LEARNINGS.md @@ -0,0 +1,151 @@ +# Learnings and Design Decisions + +Empirical findings from building the CK Tile kernel performance prediction system. +These inform the current defaults and explain why certain approaches were chosen. + +## 1. Log-Transform is Essential for Cross-Scale Accuracy + +**Problem**: GEMM TFLOPS spans 5 orders of magnitude across different problem +sizes. When training on raw TFLOPS, the regression loss (RMSE) is dominated by +large shapes where absolute errors are biggest. The model learns to predict +large shapes accurately but ignores tiny shapes where the TFLOPS values are +much lower. + +**Evidence** (168 shapes, 626K rows, 5-fold GroupKFold CV): + + +| Model | Mean Eff | P10 Eff | tiny_m Eff | Min Eff | +| ----------------------------- | ---------- | ---------- | ---------- | ---------- | +| Raw TFLOPS (500 trees) | 92.73% | 80.24% | 84.55% | 4.26% | +| **log1p(TFLOPS)** (500 trees) | **96.92%** | **94.34%** | **94.89%** | **60.27%** | +| log1p(TFLOPS) (2000 trees) | 97.51% | 93.89% | 96.04% | 63.56% | + + +**Solution**: Train on `log1p(measured_tflops)` and apply `expm1()` to +predictions. This is now the default in `train.py`. Pass `--no_log_transform` +to revert to raw regression (not recommended). + +**Why log1p, not log**: `log1p(x) = log(1 + x)` handles zero and near-zero +TFLOPS gracefully, whereas `log(x)` produces -inf for x=0. + +## 2. 
Tiny-M Shapes are the Hardest Case + +M=1 (single-token inference) shapes are fundamentally different from batch shapes: + +- Most kernel configurations produce very low TFLOPS +- The "best" kernel is often only marginally better than the rest +- The oracle performance itself is very low, so any prediction error tanks efficiency +- Many kernels fail outright (tile_m=128 with M=1 wastes 127/128 of the tile) + +The bottom shapes in our evaluation are all M=1, with efficiencies in the +63-70% range. These shapes have such low absolute performance that the model's +noise floor exceeds the performance difference between kernels. + +**Mitigation**: Log-transform helps significantly (tiny_m improved from 84% to +96%). For production use with M=1, consider a dedicated fallback (e.g., +hardcoded kernel selection for M < 4 based on known-good configs). + +## 3. IHEM (Hard Example Mining) Hurts When Scale is the Issue + +We tried Iterative Hard Example Mining with sample reweighting (2x-5x weight +on hard shapes). Result: it made things **worse**, degrading mean efficiency +from 94.31% to 92.90% over 3 iterations. + +**Why**: The hard shapes are hard because of scale mismatch, not because the +model lacks capacity. Reweighting amplifies the small-TFLOPS rows, which +distorts the learned relationship between features and performance for the +majority of shapes. The log-transform was the correct fix -- it addresses the +root cause (scale) rather than the symptom (bad predictions on tiny shapes). + +**Lesson**: IHEM is useful when the model has capacity gaps (e.g., certain +pipeline types are underrepresented). It is counterproductive when the issue +is target-variable scale. Always try target transforms before reweighting. + +## 4. GroupKFold Key = (M, N, K) Forces Generalization + +The validation uses `GroupKFold` where the group key is `(M, N, K)` -- all +kernels for the same shape go to the same fold. 
This means: + +- The model is always evaluated on shapes it has **never seen** during training +- Layout is excluded from the key, forcing the model to generalize across layouts +- Since models are per-arch, `arch` is implicit (constant within one training run) + +This is much stricter than random row splitting, where the model would see some +kernels for each shape during training. Our efficiency numbers are conservative +estimates of real-world performance on unseen shapes. + +## 5. Model Size vs Accuracy Tradeoff + + +| Config | Trees | Leaves | LR | Mean Eff | P10 Eff | Train Time | +| ------------------ | -------- | ------- | -------- | ---------- | ---------- | ------------- | +| Small (default v1) | 500 | 127 | 0.05 | 96.92% | 94.34% | ~20s | +| **Big (current)** | **2000** | **255** | **0.02** | **97.51%** | **93.89%** | **~25s/fold** | + + +The bigger model improved mean efficiency by 0.6% but P10 didn't improve +(actually slightly worse). The extra capacity helps on medium shapes but +doesn't crack the tiny-M floor. This suggests the feature set, not model +capacity, is the limiting factor for the hardest shapes. + +For C++ deployment, the bigger model (2000 trees, 255 leaves) is still fast +enough -- LightGBM inference is O(trees * log(leaves)) per sample, which is +~microseconds even at 2000 trees. + +## 6. N=1 and K=1 Shapes are Degenerate + +We generated benchmark data for 546 edge-case shapes (N=1, K=1, small N/K). +Result: **zero valid kernel results** across 94 shapes. All 4608 kernels either +fail or produce 0 TFLOPS for these degenerate dimensions. + +This means: + +- The tile engine kernels have hard minimum dimension requirements +- N=1 / K=1 shapes cannot be handled by the current kernel set +- These shapes need dedicated kernels (e.g., BLAS-1/BLAS-2 fallbacks) +- The ML model should not be expected to handle them -- they should be filtered +out before reaching the heuristic + +## 7. 
Feature Engineering Insights + +From LightGBM feature importances on the log-target model: + +**Top features** (by split count): + +- `M, N, K` -- raw dimensions are always the most important +- `tile_m, tile_n, tile_k` -- the tile shape is the primary kernel differentiator +- `overall_tile_efficiency` -- how well the shape fits the tile (the interaction) +- `num_tiles_m, total_output_tiles` -- work decomposition +- `arithmetic_intensity` -- compute vs memory bound regime +- `pipeline` -- pipeline type (compv3 vs compv4 vs mem) significantly affects perf + +**Low-importance features**: + +- Hardware constants (CUs, clock, caches) -- they're constant within one arch +model, so they provide no discriminative signal. They'll become important when +training cross-arch models. +- `split_k` -- always 1 in current data +- `persistent` -- rarely True in current kernel set + +## 8. Warm-Start Works for Incremental Updates + +LightGBM's `init_model` parameter successfully continues training from an +existing model. New trees are added on top of existing ones. Key considerations: + +- Feature schema must match exactly (enforced by `check_feature_compatibility`) +- Use fewer new trees (200-500) since we're refining, not starting fresh +- The `train_manifest.json` tracks the full lineage (total trees, data sizes) +- Quality should be at least as good as the base model (tested) + +## 9. 
Data Volume Matters More Than Model Complexity
+
+| Dataset | Shapes | Rows | Mean Eff (log, 500 trees) |
+| --- | --- | --- | --- |
+| Original (DeepSeek only) | 108 | 418K | 98.28% (on seen distribution) |
+| + Wide coverage | 168 | 626K | 96.92% (harder distribution) |
+
+The original 108-shape model looked great (98.28%) but was overfitting to the
+DeepSeek LLM inference M=1 distribution. Adding 60 diverse shapes (many M=1)
+exposed the model's weakness on tiny shapes. More diverse training data is
+always better than a bigger model on narrow data.
+
+## Summary of Defaults
+
+Based on these findings, the current defaults in `train.py` are:
+
+- **Target transform**: `log1p` for TFLOPS and bandwidth (scale normalization)
+- **Model**: 2000 trees, 255 leaves, max depth 15, LR 0.02
+- **Validation**: 5-fold GroupKFold, key = (M, N, K)
+- **Early stopping**: patience 100 (let trees fully converge)
+- **Warm start**: 500 new trees (was 200, increased for bigger base model)
diff --git a/dispatcher/heuristics/README.md b/dispatcher/heuristics/README.md
new file mode 100644
index 0000000000..91b07466b6
--- /dev/null
+++ b/dispatcher/heuristics/README.md
@@ -0,0 +1,271 @@
+# CK Tile Heuristics: ML-Based Kernel Selection
+
+Fast, accurate kernel selection for CK Tile operations using LightGBM regression
+with Origami-augmented feature engineering.
+ +## What This Does + +Instead of running all 4608+ kernel configurations on the GPU to find the best +one (exhaustive search taking ~46 seconds per shape), this system trains an ML +model that predicts TFLOPS for any (problem, kernel) pair in microseconds. It +scores all candidates instantly and picks the best kernel -- achieving 98.28% +of oracle-best TFLOPS efficiency across 108 tested shapes. + +## Quick Start + +### 1. Generate and convert benchmark data + +**Step 1: Generate benchmark data** + +```bash +python3 generate_benchmark_data.py \ + --build_dir /path/to/build \ + --output_dir data/fp16_original \ + --dtype fp16 \ + --layout rcr \ + --num_build_jobs 4 \ + --warmup 10 \ + --repeat 50 +``` + +This outputs JSON with all benchmark results. + +**Step 2: Convert JSON to parquet training format** + +```bash +python3 convert_json_to_parquet.py \ + --input data/fp16_original/benchmark_results_fp16_rcr.json \ + --output data/fp16_original/fp16_training_data.parquet \ + --arch gfx950 +``` + +The converter automatically fixes pad flags for `_mem` kernels and validates data. + +**Alternative: Parse existing logs** + +If you have raw benchmark logs from CK Tile: + +```bash +python3 data_pipeline.py ck_tile_testrun_2.log \ + -o data/gemm_universal_fp8_rcr_gfx950.parquet \ + --arch gfx950 --capture_hw +``` + +### 2. Train a model + +```bash +python3 train.py \ + --data_dir data/ \ + --out_dir models/gemm_universal_fp8_gfx950 \ + --op gemm_universal --dtype fp8 --arch gfx950 +``` + +**Note**: Trained models are automatically compressed to `.lgbm.gz` format to save space (~67% reduction). The Python tools automatically decompress them on first use and cache the decompressed version. For warm-start training, decompression happens automatically. + +### 3. Evaluate + +```bash +python3 evaluate.py \ + --model_dir models/gemm_universal_fp8_gfx950 \ + --data_dir data/ --op gemm_universal --dtype fp8 +``` + +### 4. 
Predict the best kernel for a problem + +```bash +python3 predict.py \ + --model_dir models/gemm_universal_fp8_gfx950 \ + --m 128 --n 1536 --k 7168 --layout rcr +``` + +### 5. Search for optimal configs (optional) + +```bash +python3 search.py \ + --model_dir models/gemm_universal_fp8_gfx950 \ + --m 128 --n 1536 --k 7168 \ + --strategy random --budget 500 --top_k 10 +``` + +### 6. Using models in C++ (requires decompression) + +C++ code uses the LightGBM C API which requires uncompressed `.lgbm` files. If you have compressed models (`.lgbm.gz`), decompress them first: + +```bash +cd models/gemm_universal_fp16_gfx950 +gunzip model_tflops.lgbm.gz +``` + +Then use in C++ examples: + +```bash +cd dispatcher/build +./gemm_09_ml_heuristic --model ../heuristics/models/gemm_universal_fp16_gfx950/model_tflops.lgbm +``` + +**Note**: Python tools automatically decompress `.lgbm.gz` files on first use, so you can run Python scripts first to trigger decompression, then use the same models in C++. + +## Architecture + +``` +Problem (M, N, K, dtype, layout) + | + v +FeatureEngine.extract_batch() <-- 55 features: problem, kernel, interaction, hardware + | + v +LGBMRegressor.predict() <-- predicts TFLOPS for each candidate kernel + | + v +Sort by predicted TFLOPS <-- rank all candidates + | + v +Select Top-1 kernel <-- 98.28% mean efficiency, <1ms inference +``` + +Three models are trained per (op, dtype, arch): +- **TFLOPS model** (primary): used for kernel ranking +- **Latency model** (auxiliary): for latency-sensitive workloads +- **Bandwidth model** (auxiliary): for memory-bound analysis + +## File Inventory + +| File | Purpose | +|---|---| +| `generate_benchmark_data.py` | Build and run benchmarks across ~25 diverse problem sizes, output JSON | +| `convert_json_to_parquet.py` | Convert benchmark JSON to parquet training format, fix `_mem` pad flags | +| `data_pipeline.py` | Parse raw benchmark logs into canonical parquet datasets | +| `feature_engine.py` | 55-feature 
extraction: problem, kernel, interaction, hardware profile |
+| `train.py` | Multi-target LGBMRegressor training with GroupKFold CV, IHEM, warm-start |
+| `predict.py` | Predictor class: predict TFLOPS/latency/bandwidth, rank kernels |
+| `evaluate.py` | Full evaluation: global metrics, per-shape/layout/pipeline slices |
+| `search.py` | Surrogate search: discrete DE, random top-K |
+| `generate_wide_coverage.py` | Generate benchmark data across 706 diverse shapes |
+| `generate_edge_dims.py` | Generate N=1, K=1, and other edge-case shapes |
+| `DATA_GENERATION.md` | Detailed guide for building binaries and generating data |
+| `plan.md` | Full design plan with architecture, milestones, and rationale |
+
+## Features Used (55 total)
+
+### Problem features (13)
+`M, N, K, split_k, log2(M), log2(N), log2(K), log2(MNK),
+arithmetic_intensity, aspect_ratio_mn, aspect_ratio_mk, aspect_ratio_nk, layout`
+
+### Kernel features (21)
+`tile_m, tile_n, tile_k, warp_m, warp_n, warp_k, warp_tile_m, warp_tile_n,
+warp_tile_k, pipeline, scheduler, epilogue, pad_m, pad_n, pad_k, persistent,
+num_warps, tile_volume, tile_mn, lds_usage_estimate, lds_usage_ratio`
+
+### Interaction features (9)
+`num_tiles_m, num_tiles_n, num_tiles_k, total_output_tiles,
+tile_eff_m, tile_eff_n, tile_eff_k, overall_tile_efficiency, cu_utilization`
+
+### Hardware profile features (12)
+`hw_num_cus, hw_simds_per_cu, hw_total_simds, hw_shader_engines,
+hw_max_clock_mhz, hw_max_waves_per_cu, hw_wavefront_size, hw_lds_capacity,
+hw_l1_cache_kb, hw_l2_cache_kb, hw_l3_cache_kb, hw_num_xcd`
+
+## Model Performance
+
+### fp8 RCR, gfx950
+
+| Metric | 108 shapes (original) | 168 shapes (wide coverage) |
+|---|---|---|
+| Mean TFLOPS Efficiency | 98.28% | 97.51% |
+| P10 TFLOPS Efficiency | 94.64% | 93.89% |
+| tiny_m (M=1) Efficiency | 95.57% | 96.04% |
+| R2 (TFLOPS) | 0.997 | 0.993 |
+
+### fp16 RCR, gfx950
+
+Trained on 25 shapes, 1,024 kernels, 21,920 valid benchmarks.
+ +| Metric | Value | +|---|---| +| Mean TFLOPS Efficiency | 99.36% | +| P10 TFLOPS Efficiency | 98.05% | +| P50 TFLOPS Efficiency | 100.00% | +| Min Efficiency | 95.45% | +| NDCG@1 | 64.00% | +| Top-5 Hit Rate | 88.00% | + +**Shape Family Breakdown:** + +| Shape Family | Mean Eff | P10 Eff | Shapes | +|---|---|---|---| +| Large M (M≥1024) | 99.54% | 99.07% | 4 | +| Medium M (128≤M<1024) | 99.62% | 98.74% | 7 | +| Small M (8≤M<128) | 98.82% | 96.22% | 8 | +| Tiny M (M<8) | 99.65% | 98.96% | 6 | + +**Pipeline Breakdown:** + +| Pipeline | Mean Eff | P10 Eff | +|---|---|---| +| compv3 | 99.75% | 99.09% | +| compv4 | 99.40% | 98.54% | +| mem | 99.08% | 96.59% | + +Training uses `log1p(TFLOPS)` as the target by default, which normalizes the +scale across shapes spanning 0.02 to 2230 TFLOPS. This was the key finding +that improved tiny-M shapes from 84% to 96% efficiency. See +[LEARNINGS.md](LEARNINGS.md) for details. + +## Validation + +Training uses `GroupKFold(n_splits=5)` with group key `(M, N, K)` to ensure +the model is evaluated on shapes it has never seen during training. Layout is +excluded from the group key to force cross-layout generalization. + +## Incremental Training (Warm Start) + +When new benchmark data arrives, update the model without retraining from scratch: + +```bash +python3 train.py \ + --data_dir data/ \ + --out_dir models/v2 \ + --warm_start models/gemm_universal_fp8_gfx950 \ + --warm_start_n_estimators 200 +``` + +This adds 200 new trees on top of the existing model. Feature schemas must +match exactly (automatically enforced). + +## Extending to New Ops + +Adding support for a new operation (e.g., `gemm_streamk`, `grouped_conv`): + +1. **Build binaries**: `ninja -C build benchmark_gemm_streamk_fp8_rcr` +2. **Subclass `FeatureEngine`**: add op-specific features (e.g., StreamK split factor) +3. **Generate data**: run benchmarks across diverse shapes +4. 
**Train**: `python3 train.py --op gemm_streamk --dtype fp8 --data_dir data/ --out_dir models/` + +The training, evaluation, prediction, and search infrastructure is fully +op-agnostic -- only the feature engine needs a new subclass. + +## Tests + +102 tests covering all modules: + +```bash +python3 -m pytest tests/ -v +``` + +Test coverage includes: +- Log parsing with malformed JSON, empty logs, single-kernel shapes +- Feature formula correctness (tile efficiency, LDS usage, arithmetic intensity) +- Corner-case shapes: M=1, N=1, K=1, prime dimensions, 20480x7168x256 +- Batch vs single extraction parity +- Parameter space validation and projection +- Predictor: single/batch prediction, ranking, missing models, empty inputs +- Training: group keys, efficiency computation, warm-start, feature compatibility +- Search: random, DE, config validity, determinism + +## Documentation + +- **[README.md](README.md)**: This file -- quick start, architecture, performance +- **[DATA_GENERATION.md](DATA_GENERATION.md)**: Complete guide for building tile engine + binaries, running benchmarks, managing datasets, and troubleshooting +- **[LEARNINGS.md](LEARNINGS.md)**: Empirical findings and design decisions (log-transform, + IHEM results, tiny-M analysis, feature importance, N=1/K=1 edge cases) diff --git a/dispatcher/heuristics/__init__.py b/dispatcher/heuristics/__init__.py new file mode 100644 index 0000000000..e208c91163 --- /dev/null +++ b/dispatcher/heuristics/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +# CK Tile Heuristics: ML-based kernel selection diff --git a/dispatcher/heuristics/collect_additional.sh b/dispatcher/heuristics/collect_additional.sh new file mode 100755 index 0000000000..d963b1483a --- /dev/null +++ b/dispatcher/heuristics/collect_additional.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+# SPDX-License-Identifier: MIT + +# Generate additional benchmark data for shapes NOT in the original log. +# Runs in background; outputs streaming JSON that can be parsed by data_pipeline.py. + +BIN_DIR="/workspace/ck_tile/bin" +OUT_LOG="data/additional_shapes.log" +WARMUP=3 +REPEAT=10 + +mkdir -p data + +# Additional shapes: square powers-of-2 and common ML sizes not in original DeepSeek set +SHAPES=( + "64,64,64" + "128,128,128" + "256,256,256" + "512,512,512" + "1024,1024,1024" + "2048,2048,2048" + "4096,4096,4096" + "1,4096,4096" + "8,4096,4096" + "32,4096,4096" + "128,4096,4096" + "1,4096,11008" + "32,4096,11008" + "1,8192,8192" + "32,8192,8192" + "1,8192,28672" + "32,8192,28672" + "256,256,8192" + "8192,8192,256" + "1024,4096,1024" + "4096,1024,4096" + "2048,8192,2048" +) + +echo "CK Tile Additional Shapes Benchmark" > "$OUT_LOG" +echo "GPU ID: 0" >> "$OUT_LOG" +echo "Implementation: gemm_universal" >> "$OUT_LOG" +echo "" >> "$OUT_LOG" + +SHAPE_IDX=0 +for SHAPE in "${SHAPES[@]}"; do + IFS=',' read -r M N K <<< "$SHAPE" + SHAPE_IDX=$((SHAPE_IDX + 1)) + + echo "========================================" >> "$OUT_LOG" + echo "Shape $SHAPE_IDX: M=$M N=$N K=$K dtype=fp8 layout=rcr" >> "$OUT_LOG" + echo "========================================" >> "$OUT_LOG" + + KERNEL_COUNT=0 + for EXE in "$BIN_DIR"/benchmark_gemm_universal_fp8_rcr_*; do + KERNEL_COUNT=$((KERNEL_COUNT + 1)) + OUTPUT=$("$EXE" -m="$M" -n="$N" -k="$K" -warmup=$WARMUP -repeat=$REPEAT -verify=0 2>/dev/null) + # Extract just the JSON block + echo "$OUTPUT" | sed -n '/{/,/^}/p' >> "$OUT_LOG" + done + + echo "Found $KERNEL_COUNT kernels" >> "$OUT_LOG" + echo "Completed shape $SHAPE_IDX: M=$M N=$N K=$K ($KERNEL_COUNT kernels)" >&2 +done + +echo "Done generating additional data" >&2 diff --git a/dispatcher/heuristics/convert_json_to_parquet.py b/dispatcher/heuristics/convert_json_to_parquet.py new file mode 100644 index 0000000000..4cfd667c76 --- /dev/null +++ 
b/dispatcher/heuristics/convert_json_to_parquet.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Convert benchmark JSON results to parquet format for training. + +Usage: + python convert_json_to_parquet.py \ + --input benchmark_results_fp16_rcr.json \ + --output fp16_training_data.parquet + +Features: + - Converts JSON benchmark results to flat row format + - Automatically fixes pad flags for _mem kernels + - Captures both successes and failures + - Compatible with existing training data format +""" + +import argparse +import json +import pandas as pd +from pathlib import Path + + +def convert_json_to_parquet(json_file: Path, output_file: Path, arch: str = "gfx950"): + """Convert benchmark JSON to parquet training data format.""" + + print(f"Loading {json_file}...") + with open(json_file) as f: + data = json.load(f) + + metadata = data.get("metadata", {}) + dtype = metadata.get("dtype", "fp16") + layout = metadata.get("layout", "rcr") + + print(f" Data type: {dtype}") + print(f" Layout: {layout}") + print(f" Kernels: {metadata.get('num_kernels', 0)}") + print(f" Problem sizes: {metadata.get('num_problems', 0)}") + print() + + rows = [] + for kernel_result in data["results"]: + kernel_config = kernel_result["kernel_config"] + + for benchmark in kernel_result["benchmarks"]: + # Common fields for both valid and invalid runs + row = { + "op_type": "gemm_universal", + "dtype": dtype, + "layout": layout, + "arch": arch, + "kernel_name": kernel_config["name"], + "m": benchmark["m"], + "n": benchmark["n"], + "k": benchmark["k"], + "split_k": 1, + "is_valid": benchmark["is_valid"], + "run_id": 0, + "pipeline": kernel_config["pipeline"], + "epilogue": kernel_config["epilogue"], + "scheduler": kernel_config["scheduler"], + "pad_m": kernel_config["pad_m"], + "pad_n": kernel_config["pad_n"], + "pad_k": kernel_config["pad_k"], + "persistent": kernel_config["persistent"], + 
"tile_m": kernel_config["tile_m"],
+                "tile_n": kernel_config["tile_n"],
+                "tile_k": kernel_config["tile_k"],
+                "warp_m": kernel_config["warp_m"],
+                "warp_n": kernel_config["warp_n"],
+                "warp_k": kernel_config["warp_k"],
+                "warp_tile_m": kernel_config["warp_tile_m"],
+                "warp_tile_n": kernel_config["warp_tile_n"],
+                "warp_tile_k": kernel_config["warp_tile_k"],
+            }
+
+            if benchmark["is_valid"]:
+                # Valid run - include performance metrics
+                row["measured_tflops"] = benchmark["tflops"]
+                row["latency_ms"] = benchmark["avg_time_ms"]
+                # Estimate bandwidth: bytes for A + B + C at this dtype's element size
+                m, n, k = benchmark["m"], benchmark["n"], benchmark["k"]
+                bytes_transferred = (m * k + k * n + m * n) * {"fp8": 1, "fp16": 2, "bf16": 2, "fp32": 4}.get(dtype, 2)
+                if benchmark["avg_time_ms"] > 0:
+                    row["bandwidth_gb_s"] = (bytes_transferred / 1e9) / (
+                        benchmark["avg_time_ms"] / 1000
+                    )
+                else:
+                    row["bandwidth_gb_s"] = 0.0
+            else:
+                # Failed run - zero metrics
+                row["measured_tflops"] = 0.0
+                row["latency_ms"] = 0.0
+                row["bandwidth_gb_s"] = 0.0
+
+            rows.append(row)
+
+    df = pd.DataFrame(rows)
+
+    print(f"Converted {len(df):,} benchmark results")
+    print(f"  Valid: {df['is_valid'].sum():,}")
+    print(f"  Failed: {(~df['is_valid']).sum():,}")
+    print()
+
+    # Fix pad flags for _mem kernels (critical for P1 features!)
+ print("Fixing pad flags for _mem kernels...") + mem_mask = df["pipeline"] == "mem" + mem_count = mem_mask.sum() + + if mem_count > 0: + df.loc[mem_mask, "pad_m"] = True + df.loc[mem_mask, "pad_n"] = True + df.loc[mem_mask, "pad_k"] = True + print(f" ✓ Fixed {mem_count:,} _mem kernel rows") + print() + + # Save to parquet + df.to_parquet(output_file, index=False) + print(f"✓ Saved to {output_file}") + print() + + # Show statistics + print("=" * 80) + print("STATISTICS") + print("=" * 80) + print() + + print("Dimension ranges:") + print(f" M: {df['m'].min():,} - {df['m'].max():,}") + print(f" N: {df['n'].min():,} - {df['n'].max():,}") + print(f" K: {df['k'].min():,} - {df['k'].max():,}") + print() + + print("Pipeline distribution:") + print(df["pipeline"].value_counts()) + print() + + print("Pad flag distribution:") + pad_combos = df[["pad_m", "pad_n", "pad_k"]].value_counts() + print(pad_combos) + print() + + if (~df["is_valid"]).sum() > 0: + print("Failure analysis:") + failed = df[~df["is_valid"]] + print(f" Total failures: {len(failed):,}") + + # Group by pipeline + print("\n By pipeline:") + for pipeline, count in failed["pipeline"].value_counts().items(): + print(f" {pipeline}: {count:,}") + + # Show sample failures + print("\n Sample failures:") + for _, row in failed.head(5).iterrows(): + print( + f" {row['kernel_name'][:60]:60s} M={row['m']:4d} N={row['n']:4d} K={row['k']:4d}" + ) + + return df + + +def merge_datasets(parquet_files: list[Path], output_file: Path): + """Merge multiple parquet files into one.""" + + print("=" * 80) + print("MERGING DATASETS") + print("=" * 80) + print() + + dfs = [] + for pq_file in parquet_files: + if pq_file.exists(): + df = pd.read_parquet(pq_file) + print(f" {pq_file.name}: {len(df):,} rows") + dfs.append(df) + else: + print(f" ✗ {pq_file} not found, skipping") + + if not dfs: + print("No files to merge!") + return + + combined = pd.concat(dfs, ignore_index=True) + combined.to_parquet(output_file, index=False) + + 
print() + print(f"✓ Merged {len(combined):,} total rows to {output_file}") + print() + + # Show dtype distribution + print("Data type distribution:") + print(combined["dtype"].value_counts()) + print() + + print("Layout distribution:") + print(combined["layout"].value_counts()) + + +def main(): + parser = argparse.ArgumentParser( + description="Convert benchmark JSON to parquet training data", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "--input", type=str, required=True, help="Input JSON file from benchmark" + ) + parser.add_argument("--output", type=str, required=True, help="Output parquet file") + parser.add_argument("--arch", type=str, default="gfx950", help="GPU architecture") + parser.add_argument( + "--merge_with", type=str, nargs="*", help="Additional parquet files to merge" + ) + + args = parser.parse_args() + + input_file = Path(args.input) + output_file = Path(args.output) + + # Convert JSON to parquet + df = convert_json_to_parquet(input_file, output_file, args.arch) + + # Merge if requested + if args.merge_with: + merge_files = [output_file] + [Path(f) for f in args.merge_with] + merged_output = output_file.parent / f"{output_file.stem}_merged.parquet" + merge_datasets(merge_files, merged_output) + + +if __name__ == "__main__": + main() diff --git a/dispatcher/heuristics/data_pipeline.py b/dispatcher/heuristics/data_pipeline.py new file mode 100644 index 0000000000..c3f5f9ced7 --- /dev/null +++ b/dispatcher/heuristics/data_pipeline.py @@ -0,0 +1,394 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Data pipeline for CK Tile heuristics. + +Parses benchmark logs and structured JSON into a canonical parquet dataset. 
+Supports: + - Streaming log format (Shape N: headers + inline JSON) from ck_tile profiling runs + - Structured JSON from generate_benchmark_data.py + - Direct parquet passthrough +""" + +import json +import re +import subprocess +import hashlib +from pathlib import Path +from typing import Optional + +import pandas as pd + + +CANONICAL_COLUMNS = [ + "op_type", + "dtype", + "layout", + "arch", + "kernel_name", + "m", + "n", + "k", + "split_k", + "measured_tflops", + "latency_ms", + "bandwidth_gb_s", + "is_valid", + "tile_m", + "tile_n", + "tile_k", + "warp_m", + "warp_n", + "warp_k", + "warp_tile_m", + "warp_tile_n", + "warp_tile_k", + "pipeline", + "scheduler", + "epilogue", + "pad_m", + "pad_n", + "pad_k", + "persistent", + "run_id", +] + + +def parse_kernel_name(name: str) -> dict: + """Extract kernel config fields from a gemm_universal kernel name. + + Name format: + gemm_universal_{dtype}_{layout}_{pipeline}_{epilogue}_{scheduler} + _{padM}_{padN}_{padK}_{persistent}_{tileM}x{tileN}x{tileK} + _{warpM}x{warpN}x{warpK}_{warpTileM}x{warpTileN}x{warpTileK} + """ + result = {} + try: + prefix_match = re.match( + r"gemm_universal_(\w+?)_((?:rcr|rrr|crr|ccr))_(.*)", name + ) + if not prefix_match: + return result + result["dtype"] = prefix_match.group(1) + result["layout"] = prefix_match.group(2) + remainder = prefix_match.group(3) + + parts = remainder.split("_") + if len(parts) < 10: + return result + + result["pipeline"] = parts[0] + result["epilogue"] = parts[1] + result["scheduler"] = parts[2] + result["pad_m"] = parts[3] == "True" + result["pad_n"] = parts[4] == "True" + result["pad_k"] = parts[5] == "True" + result["persistent"] = parts[6] == "True" + + tile_dims = parts[7].split("x") + warp_dims = parts[8].split("x") + warp_tile_dims = parts[9].split("x") + + result["tile_m"] = int(tile_dims[0]) + result["tile_n"] = int(tile_dims[1]) + result["tile_k"] = int(tile_dims[2]) + result["warp_m"] = int(warp_dims[0]) + result["warp_n"] = int(warp_dims[1]) + 
result["warp_k"] = int(warp_dims[2]) + result["warp_tile_m"] = int(warp_tile_dims[0]) + result["warp_tile_n"] = int(warp_tile_dims[1]) + result["warp_tile_k"] = int(warp_tile_dims[2]) + except (IndexError, ValueError): + pass + return result + + +def _layout_from_problem(problem: dict) -> str: + """Derive layout shorthand (rcr/rrr/etc.) from problem JSON fields.""" + la = problem.get("layout_a", "") + lb = problem.get("layout_b", "") + lc = problem.get("layout_c", "") + + def _tag(s): + s = s.lower() + if "row" in s: + return "r" + if "col" in s: + return "c" + return "?" + + return _tag(la) + _tag(lb) + _tag(lc) + + +def parse_streaming_log( + path: str | Path, + arch: str = "unknown", + run_id: Optional[str] = None, + op_type: str = "gemm_universal", +) -> pd.DataFrame: + """Parse a CK Tile streaming benchmark log into a canonical DataFrame. + + The log alternates between shape headers and JSON result blocks: + Shape N: M=16 N=1536 K=7168 dtype=fp8 layout=rcr + { + "name": "gemm_universal_...", + "problem": { ... }, + "perf_result": { "latency(ms)": ..., "tflops(TFlops)": ..., "bandwidth(GB/s)": ... 
} + } + """ + path = Path(path) + if run_id is None: + run_id = hashlib.md5(path.name.encode()).hexdigest()[:12] + + shape_re = re.compile( + r"Shape\s+\d+:\s+M=(\d+)\s+N=(\d+)\s+K=(\d+)\s+dtype=(\w+)\s+layout=(\w+)" + ) + + rows = [] + current_m, current_n, current_k = 0, 0, 0 + current_dtype, current_layout = "", "" + json_buf = [] + brace_depth = 0 + + with open(path, "r") as f: + for line in f: + stripped = line.strip() + + shape_match = shape_re.search(stripped) + if shape_match: + current_m = int(shape_match.group(1)) + current_n = int(shape_match.group(2)) + current_k = int(shape_match.group(3)) + current_dtype = shape_match.group(4) + current_layout = shape_match.group(5) + continue + + if brace_depth == 0 and stripped.startswith("{"): + json_buf = [stripped] + brace_depth = stripped.count("{") - stripped.count("}") + if brace_depth == 0: + raw = "\n".join(json_buf) + try: + obj = json.loads(raw) + except json.JSONDecodeError: + continue + else: + continue + elif brace_depth > 0: + json_buf.append(stripped) + brace_depth += stripped.count("{") - stripped.count("}") + if brace_depth <= 0: + brace_depth = 0 + raw = "\n".join(json_buf) + try: + obj = json.loads(raw) + except json.JSONDecodeError: + continue + else: + continue + else: + continue + + # If we get here, obj was successfully parsed + kernel_name = obj.get("name", "") + problem = obj.get("problem", {}) + perf = obj.get("perf_result", {}) + + m = problem.get("m", current_m) + n = problem.get("n", current_n) + k = problem.get("k", current_k) + split_k = problem.get("split_k", 1) + dtype = problem.get("dtype_a", current_dtype) + layout = ( + _layout_from_problem(problem) + if problem.get("layout_a") + else current_layout + ) + + tflops = perf.get("tflops(TFlops)", 0.0) + latency = perf.get("latency(ms)", 0.0) + bandwidth = perf.get("bandwidth(GB/s)", 0.0) + + kp = parse_kernel_name(kernel_name) + + row = { + "op_type": op_type, + "dtype": dtype, + "layout": layout, + "arch": arch, + "kernel_name": 
kernel_name, + "m": m, + "n": n, + "k": k, + "split_k": split_k, + "measured_tflops": tflops, + "latency_ms": latency, + "bandwidth_gb_s": bandwidth, + "is_valid": tflops > 0 and latency > 0, + "run_id": run_id, + } + row.update(kp) + rows.append(row) + + df = pd.DataFrame(rows) + for col in CANONICAL_COLUMNS: + if col not in df.columns: + df[col] = None + return df + + +def get_hardware_profile() -> dict: + """Capture GPU hardware profile from rocminfo.""" + profile = {} + try: + result = subprocess.run( + ["rocminfo"], capture_output=True, text=True, timeout=30 + ) + output = result.stdout + + gpu_section = False + for line in output.split("\n"): + line = line.strip() + if "Device Type:" in line and "GPU" in line: + gpu_section = True + continue + if gpu_section and "Device Type:" in line and "GPU" not in line: + break + if not gpu_section: + continue + + if ":" not in line: + continue + key, _, val = line.partition(":") + key = key.strip() + val = val.strip() + + mapping = { + "Name": "gfx_name", + "Marketing Name": "marketing_name", + "Compute Unit": "num_cus", + "SIMDs per CU": "simds_per_cu", + "Shader Engines": "shader_engines", + "Shader Arrs. per Eng.": "shader_arrays_per_engine", + "Max Clock Freq. 
(MHz)": "max_clock_mhz", + "Wavefront Size": "wavefront_size", + "Max Waves Per CU": "max_waves_per_cu", + "Chip ID": "chip_id", + } + + if key in mapping: + raw = val.split("(")[0].strip() + try: + profile[mapping[key]] = int(raw) + except ValueError: + profile[mapping[key]] = raw + + for line in output.split("\n"): + line = line.strip() + if line.startswith("L1:") and "num_cus" in profile: + raw = line.split(":")[1].strip().split("(")[0].strip() + try: + profile["l1_cache_kb"] = int(raw) + except ValueError: + pass + elif line.startswith("L2:"): + raw = line.split(":")[1].strip().split("(")[0].strip() + try: + profile["l2_cache_kb"] = int(raw) + except ValueError: + pass + elif line.startswith("L3:"): + raw = line.split(":")[1].strip().split("(")[0].strip() + try: + profile["l3_cache_kb"] = int(raw) + except ValueError: + pass + + except (subprocess.TimeoutExpired, FileNotFoundError): + pass + + return profile + + +def load_parquet(path: str | Path) -> pd.DataFrame: + """Load a canonical parquet dataset.""" + return pd.read_parquet(path) + + +def save_parquet(df: pd.DataFrame, path: str | Path): + """Save a DataFrame in canonical parquet format.""" + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + df.to_parquet(path, index=False, engine="pyarrow") + + +def build_training_dataset( + data_dir: str | Path, + op_type: str = "gemm_universal", + dtype: str = "fp8", +) -> pd.DataFrame: + """Load and merge all parquet files matching the given op/dtype from a directory.""" + data_dir = Path(data_dir) + frames = [] + for f in sorted(data_dir.glob("*.parquet")): + df = pd.read_parquet(f) + if "op_type" in df.columns: + df = df[df["op_type"] == op_type] + if "dtype" in df.columns: + df = df[df["dtype"] == dtype] + if len(df) > 0: + frames.append(df) + if not frames: + raise FileNotFoundError( + f"No parquet files with op_type={op_type}, dtype={dtype} in {data_dir}" + ) + return pd.concat(frames, ignore_index=True) + + +if __name__ == "__main__": + import 
argparse + import time + + parser = argparse.ArgumentParser(description="Parse CK Tile benchmark data") + parser.add_argument("input", help="Input file (log or parquet)") + parser.add_argument("--output", "-o", required=True, help="Output parquet path") + parser.add_argument("--arch", default="gfx950", help="GPU architecture") + parser.add_argument("--op_type", default="gemm_universal", help="Operation type") + parser.add_argument( + "--capture_hw", + action="store_true", + help="Capture hardware profile from rocminfo", + ) + args = parser.parse_args() + + input_path = Path(args.input) + + print(f"Parsing {input_path}...") + t0 = time.time() + + if input_path.suffix == ".parquet": + df = load_parquet(input_path) + else: + df = parse_streaming_log(input_path, arch=args.arch, op_type=args.op_type) + + elapsed = time.time() - t0 + print(f"Parsed {len(df)} rows in {elapsed:.1f}s") + print(f" Unique shapes: {df.groupby(['m', 'n', 'k']).ngroups}") + print(f" Unique kernels: {df['kernel_name'].nunique()}") + print(f" Valid rows: {df['is_valid'].sum()} / {len(df)}") + + if df["measured_tflops"].max() > 0: + print( + f" TFLOPS range: {df['measured_tflops'].min():.2f} - {df['measured_tflops'].max():.2f}" + ) + + if args.capture_hw: + hw = get_hardware_profile() + print(f" Hardware profile: {hw}") + for k, v in hw.items(): + df[f"hw_{k}"] = v + + save_parquet(df, args.output) + print(f"Saved to {args.output}") diff --git a/dispatcher/heuristics/dispatcher_integration.py b/dispatcher/heuristics/dispatcher_integration.py new file mode 100644 index 0000000000..c449c1e816 --- /dev/null +++ b/dispatcher/heuristics/dispatcher_integration.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Dispatcher integration for ML-based kernel selection. + +Bridges the trained LightGBM Predictor with the CK Tile dispatcher's +kernel selection flow. 
Provides heuristic functions compatible with +both the Python pre-selection pattern (08_heuristics.py style) and +the C++ HeuristicFunction signature. + +Name mapping between feature engine and dispatcher KernelConfig: + Feature engine Dispatcher KernelConfig + --------------------- ---------------------- + warp_m (warps/block) wave_m + warp_n wave_n + warp_k wave_k + warp_tile_m warp_m + warp_tile_n warp_n + warp_tile_k warp_k + +Usage: + from dispatcher_integration import create_ml_heuristic + + heuristic = create_ml_heuristic("models/gemm_universal_fp8_gfx950") + best_spec = heuristic(M=1024, N=1024, K=1024, kernel_pool=KERNEL_POOL) +""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + + +from data_pipeline import parse_kernel_name +from predict import Predictor + + +LAYOUT_TO_DISPATCHER = { + "rcr": ("row", "col", "row"), + "rrr": ("row", "row", "row"), + "crr": ("col", "row", "row"), + "ccr": ("col", "col", "row"), +} + +DTYPE_TO_C_DTYPE = { + "fp8": "fp16", + "fp16": "fp16", + "bf16": "bf16", + "fp32": "fp32", +} + + +@dataclass +class MLKernelSpec: + """Kernel spec returned by the ML heuristic, compatible with the dispatcher + example pattern. Carries both the feature-engine-space config and the + dispatcher-space KernelConfig fields.""" + + kernel_name: str + predicted_tflops: float + + tile_m: int + tile_n: int + tile_k: int + wave_m: int + wave_n: int + wave_k: int + warp_m: int + warp_n: int + warp_k: int + pipeline: str + scheduler: str + epilogue: str + pad_m: bool + pad_n: bool + pad_k: bool + persistent: bool + + +def kernel_config_to_feature_dict(kernel_name: str) -> dict: + """Parse a tile-engine kernel name into a feature-engine-compatible dict. + + Returns a dict with fields matching what GemmUniversalFeatureEngine.extract() + expects for the kernel parameter: tile_m/n/k, warp_m/n/k (warps per block), + warp_tile_m/n/k, pipeline, scheduler, epilogue, pad_m/n/k, persistent. 
+ """ + parsed = parse_kernel_name(kernel_name) + if not parsed: + return {} + parsed["kernel_name"] = kernel_name + return parsed + + +def feature_dict_to_dispatcher_config( + feat: dict, dtype: str = "fp8", arch: str = "gfx950" +) -> dict: + """Convert a feature-engine kernel dict to dispatcher KernelConfig fields. + + Handles the naming inversion: + feature engine warp_m -> KernelConfig wave_m (warps per block) + feature engine warp_tile_m -> KernelConfig warp_m (elements per warp) + """ + layout = feat.get("layout", "rcr") + la, lb, lc = LAYOUT_TO_DISPATCHER.get(layout, ("row", "col", "row")) + c_dtype = DTYPE_TO_C_DTYPE.get(dtype, dtype) + + return { + "dtype_a": dtype, + "dtype_b": dtype, + "dtype_c": c_dtype, + "dtype_acc": "fp32", + "layout_a": la, + "layout_b": lb, + "layout_c": lc, + "tile_m": feat.get("tile_m", 128), + "tile_n": feat.get("tile_n", 128), + "tile_k": feat.get("tile_k", 64), + "wave_m": feat.get("warp_m", 2), + "wave_n": feat.get("warp_n", 2), + "wave_k": feat.get("warp_k", 1), + "warp_m": feat.get("warp_tile_m", 32), + "warp_n": feat.get("warp_tile_n", 32), + "warp_k": feat.get("warp_tile_k", 16), + "pipeline": feat.get("pipeline", "compv3"), + "scheduler": feat.get("scheduler", "intrawave"), + "epilogue": feat.get("epilogue", "cshuffle"), + "pad_m": feat.get("pad_m", True), + "pad_n": feat.get("pad_n", True), + "pad_k": feat.get("pad_k", True), + "gfx_arch": arch, + } + + +def feature_dict_to_ml_spec(feat: dict, predicted_tflops: float = 0.0) -> MLKernelSpec: + """Convert a feature-engine kernel dict + prediction to an MLKernelSpec.""" + return MLKernelSpec( + kernel_name=feat.get("kernel_name", "unknown"), + predicted_tflops=predicted_tflops, + tile_m=feat.get("tile_m", 128), + tile_n=feat.get("tile_n", 128), + tile_k=feat.get("tile_k", 64), + wave_m=feat.get("warp_m", 2), + wave_n=feat.get("warp_n", 2), + wave_k=feat.get("warp_k", 1), + warp_m=feat.get("warp_tile_m", 32), + warp_n=feat.get("warp_tile_n", 32), + 
warp_k=feat.get("warp_tile_k", 16), + pipeline=feat.get("pipeline", "compv3"), + scheduler=feat.get("scheduler", "intrawave"), + epilogue=feat.get("epilogue", "cshuffle"), + pad_m=feat.get("pad_m", False), + pad_n=feat.get("pad_n", False), + pad_k=feat.get("pad_k", False), + persistent=feat.get("persistent", False), + ) + + +def load_kernel_pool_from_binaries(bin_dir: str | Path) -> list[dict]: + """Discover benchmark executables and parse their names into feature dicts. + + Each executable name encodes the full kernel config. This creates the + candidate pool for the ML heuristic without needing a registry JSON export. + """ + bin_dir = Path(bin_dir) + configs = [] + for exe in sorted(bin_dir.glob("benchmark_gemm_universal_*")): + name = exe.stem.replace("benchmark_", "") + feat = kernel_config_to_feature_dict(name) + if feat and "tile_m" in feat: + configs.append(feat) + return configs + + +def create_ml_heuristic( + model_dir: str | Path, + dtype: str = "fp8", + arch: str = "gfx950", + layout: str = "rcr", + kernel_pool: Optional[list[dict]] = None, + bin_dir: Optional[str | Path] = None, +): + """Create an ML heuristic function for kernel selection. + + Returns a callable with signature: + (M: int, N: int, K: int) -> MLKernelSpec + + The returned function scores all candidate kernels using the trained + LightGBM regressor and returns the best one as an MLKernelSpec. + + Parameters + ---------- + model_dir : str or Path + Path to trained model directory (must contain model_tflops.lgbm or + model_tflops_log_big.lgbm and feature_spec.json). + dtype : str + Data type for the problem (fp8, fp16, bf16). + arch : str + GPU architecture (gfx942, gfx950). + layout : str + Matrix layout (rcr, rrr, crr, ccr). + kernel_pool : list of dict, optional + Pre-parsed kernel configs. If None, loads from bin_dir. + bin_dir : str or Path, optional + Directory with benchmark executables. Used to build kernel_pool if + kernel_pool is not provided. Defaults to /workspace/ck_tile/bin. 
+ """ + model_dir = Path(model_dir) + predictor = Predictor(model_dir) + + if kernel_pool is None: + if bin_dir is None: + bin_dir = Path("/workspace/ck_tile/bin") + kernel_pool = load_kernel_pool_from_binaries(bin_dir) + + if not kernel_pool: + raise ValueError( + "No kernel configs found. Check bin_dir or provide kernel_pool." + ) + + def heuristic(M: int, N: int, K: int) -> MLKernelSpec: + problem = { + "m": M, + "n": N, + "k": K, + "dtype": dtype, + "layout": layout, + "split_k": 1, + } + + ranked = predictor.rank_kernels(problem, kernel_pool) + + if not ranked: + feat = kernel_pool[0] + return feature_dict_to_ml_spec(feat, 0.0) + + best_name, best_tflops = ranked[0] + best_feat = next( + (kp for kp in kernel_pool if kp.get("kernel_name") == best_name), + kernel_pool[0], + ) + return feature_dict_to_ml_spec(best_feat, best_tflops) + + return heuristic + + +def create_ranked_heuristic( + model_dir: str | Path, + dtype: str = "fp8", + arch: str = "gfx950", + layout: str = "rcr", + kernel_pool: Optional[list[dict]] = None, + bin_dir: Optional[str | Path] = None, + top_k: int = 5, +): + """Create an ML heuristic that returns the top-K ranked kernel specs. + + Returns a callable with signature: + (M: int, N: int, K: int) -> list[MLKernelSpec] + + Useful when you want fallback options if the top-1 kernel fails to build. 
+ """ + model_dir = Path(model_dir) + predictor = Predictor(model_dir) + + if kernel_pool is None: + if bin_dir is None: + bin_dir = Path("/workspace/ck_tile/bin") + kernel_pool = load_kernel_pool_from_binaries(bin_dir) + + name_to_feat = {kp.get("kernel_name", ""): kp for kp in kernel_pool} + + def heuristic(M: int, N: int, K: int) -> list[MLKernelSpec]: + problem = { + "m": M, + "n": N, + "k": K, + "dtype": dtype, + "layout": layout, + "split_k": 1, + } + + ranked = predictor.rank_kernels(problem, kernel_pool) + results = [] + for name, tflops in ranked[:top_k]: + feat = name_to_feat.get(name, kernel_pool[0]) + results.append(feature_dict_to_ml_spec(feat, tflops)) + return results + + return heuristic + + +def ml_spec_to_dispatcher_config( + spec: MLKernelSpec, dtype: str = "fp8", arch: str = "gfx950" +) -> dict: + """Convert an MLKernelSpec to a dict compatible with ctypes_utils.KernelConfig.""" + layout_a, layout_b, layout_c = "row", "col", "row" + c_dtype = DTYPE_TO_C_DTYPE.get(dtype, dtype) + + return { + "dtype_a": dtype, + "dtype_b": dtype, + "dtype_c": c_dtype, + "dtype_acc": "fp32", + "layout_a": layout_a, + "layout_b": layout_b, + "layout_c": layout_c, + "tile_m": spec.tile_m, + "tile_n": spec.tile_n, + "tile_k": spec.tile_k, + "wave_m": spec.wave_m, + "wave_n": spec.wave_n, + "wave_k": spec.wave_k, + "warp_m": spec.warp_m, + "warp_n": spec.warp_n, + "warp_k": spec.warp_k, + "pipeline": spec.pipeline, + "scheduler": spec.scheduler, + "epilogue": spec.epilogue, + "pad_m": spec.pad_m, + "pad_n": spec.pad_n, + "pad_k": spec.pad_k, + "gfx_arch": arch, + } diff --git a/dispatcher/heuristics/evaluate.py b/dispatcher/heuristics/evaluate.py new file mode 100644 index 0000000000..95c850aaf5 --- /dev/null +++ b/dispatcher/heuristics/evaluate.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Evaluation and reporting for CK Tile kernel performance models. 
+ +Computes: + - Global metrics: TFLOPS efficiency (mean, p10, p50, min), R2, NDCG@1, Top-K hit rate + - Per-slice breakdowns: by layout, shape family, K-depth regime, pipeline + - Cross-target consistency checks + - Feature importance analysis + +Usage: + python evaluate.py --model_dir models/gemm_universal_fp8_gfx950 --data_dir data/ +""" + +import argparse +import json + +import numpy as np +import pandas as pd + +from data_pipeline import build_training_dataset +from feature_engine import GemmUniversalFeatureEngine +from predict import Predictor +from train import compute_tflops_efficiency + + +def classify_shape_family(m: int, n: int, k: int) -> str: + """Classify a GEMM shape into a family for sliced evaluation. + + Families: + - tiny_m: M < 32 (single-token / very small batch inference) + - small_m: 32 <= M < 256 + - medium_m: 256 <= M < 4096 + - large_m: M >= 4096 + - square: 0.5 <= M/N <= 2.0 and 0.5 <= M/K <= 2.0 + - tall: M/N > 2.0 + - wide: M/N < 0.5 + """ + if m < 32: + return "tiny_m" + elif m < 256: + return "small_m" + elif m < 4096: + return "medium_m" + elif m >= 4096: + return "large_m" + return "other" + + +def classify_k_regime(k: int) -> str: + """Classify K dimension into depth regime.""" + if k < 512: + return "shallow_k" + elif k < 4096: + return "medium_k" + else: + return "deep_k" + + +def evaluate_model( + predictor: Predictor, + df: pd.DataFrame, + feature_engine: GemmUniversalFeatureEngine, +) -> dict: + """Run full evaluation on a dataset. Returns a metrics dictionary. + + Parameters + ---------- + predictor : Predictor + Trained predictor with at least a TFLOPS model loaded. + df : pd.DataFrame + Benchmark data in canonical schema. + feature_engine : GemmUniversalFeatureEngine + Feature engine matching the trained model. + + Returns + ------- + dict with keys: global_metrics, shape_family_metrics, k_regime_metrics, + pipeline_metrics, per_shape_efficiency. 
+ """ + valid = df[df["is_valid"].fillna(False) & (df["measured_tflops"] > 0)].copy() + valid = valid.reset_index(drop=True) + + X = feature_engine.extract_batch(valid) + model = predictor._load_model("tflops") + if model is None: + raise FileNotFoundError("No TFLOPS model found") + + # Predict and apply inverse log transform if model was trained in log-space + raw_pred = model.predict(X) + if "tflops" in predictor._log_targets: + valid["pred_tflops"] = np.expm1(raw_pred) + else: + # Clamp to non-negative even for non-log models + valid["pred_tflops"] = np.maximum(0.0, raw_pred) + + y_true = valid["measured_tflops"].values + y_pred = valid["pred_tflops"].values + + ss_res = np.sum((y_true - y_pred) ** 2) + ss_tot = np.sum((y_true - y_true.mean()) ** 2) + r2 = 1 - ss_res / max(ss_tot, 1e-10) + rmse = np.sqrt(np.mean((y_true - y_pred) ** 2)) + mae = np.mean(np.abs(y_true - y_pred)) + + eff_df = compute_tflops_efficiency(valid, "pred_tflops") + + ndcg1_count = 0 + total_shapes = 0 + topk_hits = {3: 0, 5: 0, 10: 0} + + for (m, n, k), group in valid.groupby(["m", "n", "k"]): + if group["measured_tflops"].max() <= 0: + continue + total_shapes += 1 + oracle_idx = group["measured_tflops"].idxmax() + pred_ranking = group.sort_values("pred_tflops", ascending=False).index.tolist() + + if pred_ranking[0] == oracle_idx: + ndcg1_count += 1 + + oracle_rank = pred_ranking.index(oracle_idx) + for topk in topk_hits: + if oracle_rank < topk: + topk_hits[topk] += 1 + + global_metrics = { + "r2": r2, + "rmse": rmse, + "mae": mae, + "num_valid_rows": len(valid), + "num_shapes": total_shapes, + "efficiency_mean": float(eff_df["efficiency"].mean()) if len(eff_df) > 0 else 0, + "efficiency_p10": float(eff_df["efficiency"].quantile(0.1)) + if len(eff_df) > 0 + else 0, + "efficiency_p50": float(eff_df["efficiency"].quantile(0.5)) + if len(eff_df) > 0 + else 0, + "efficiency_min": float(eff_df["efficiency"].min()) if len(eff_df) > 0 else 0, + "ndcg_at_1": ndcg1_count / max(total_shapes, 1), + 
"top3_hit_rate": topk_hits[3] / max(total_shapes, 1), + "top5_hit_rate": topk_hits[5] / max(total_shapes, 1), + "top10_hit_rate": topk_hits[10] / max(total_shapes, 1), + } + + def _slice_efficiency(slice_df): + if len(slice_df) == 0: + return {"count": 0} + eff = compute_tflops_efficiency(slice_df, "pred_tflops") + if len(eff) == 0: + return {"count": 0} + return { + "count": len(eff), + "mean": float(eff["efficiency"].mean()), + "p10": float(eff["efficiency"].quantile(0.1)), + "min": float(eff["efficiency"].min()), + } + + valid["shape_family"] = valid.apply( + lambda r: classify_shape_family(r["m"], r["n"], r["k"]), axis=1 + ) + valid["k_regime"] = valid["k"].apply(classify_k_regime) + + shape_family_metrics = {} + for family, group in valid.groupby("shape_family"): + shape_family_metrics[family] = _slice_efficiency(group) + + k_regime_metrics = {} + for regime, group in valid.groupby("k_regime"): + k_regime_metrics[regime] = _slice_efficiency(group) + + pipeline_metrics = {} + if "pipeline" in valid.columns: + for pipeline, group in valid.groupby("pipeline"): + pipeline_metrics[str(pipeline)] = _slice_efficiency(group) + + return { + "global_metrics": global_metrics, + "shape_family_metrics": shape_family_metrics, + "k_regime_metrics": k_regime_metrics, + "pipeline_metrics": pipeline_metrics, + "per_shape_efficiency": eff_df.to_dict(orient="records") + if len(eff_df) > 0 + else [], + } + + +def main(): + parser = argparse.ArgumentParser(description="Evaluate CK Tile performance model") + parser.add_argument( + "--model_dir", required=True, help="Directory with trained models" + ) + parser.add_argument("--data_dir", required=True, help="Directory with parquet data") + parser.add_argument("--op", default="gemm_universal") + parser.add_argument("--dtype", default="fp8") + parser.add_argument("--output", "-o", help="Output JSON path for metrics") + args = parser.parse_args() + + print(f"Loading data from {args.data_dir}...") + df = 
build_training_dataset(args.data_dir, op_type=args.op, dtype=args.dtype) + print(f" {len(df)} rows, {df.groupby(['m', 'n', 'k']).ngroups} shapes") + + fe = GemmUniversalFeatureEngine() + predictor = Predictor(args.model_dir, feature_engine=fe) + + print("Evaluating...") + results = evaluate_model(predictor, df, fe) + + gm = results["global_metrics"] + print("\nGlobal Metrics:") + print(f" R2: {gm['r2']:.4f}") + print(f" RMSE: {gm['rmse']:.2f}") + print(f" Efficiency Mean: {gm['efficiency_mean']:.4f}") + print(f" Efficiency P10: {gm['efficiency_p10']:.4f}") + print(f" Efficiency P50: {gm['efficiency_p50']:.4f}") + print(f" Efficiency Min: {gm['efficiency_min']:.4f}") + print(f" NDCG@1: {gm['ndcg_at_1']:.4f}") + print(f" Top-3 Hit Rate: {gm['top3_hit_rate']:.4f}") + print(f" Top-5 Hit Rate: {gm['top5_hit_rate']:.4f}") + print(f" Top-10 Hit Rate: {gm['top10_hit_rate']:.4f}") + + print("\nShape Family Breakdown:") + for family, metrics in sorted(results["shape_family_metrics"].items()): + if metrics.get("count", 0) > 0: + print( + f" {family:12s}: mean={metrics['mean']:.4f} p10={metrics['p10']:.4f} min={metrics['min']:.4f} (n={metrics['count']})" + ) + + print("\nK-Depth Regime Breakdown:") + for regime, metrics in sorted(results["k_regime_metrics"].items()): + if metrics.get("count", 0) > 0: + print( + f" {regime:12s}: mean={metrics['mean']:.4f} p10={metrics['p10']:.4f} min={metrics['min']:.4f} (n={metrics['count']})" + ) + + print("\nPipeline Breakdown:") + for pipeline, metrics in sorted(results["pipeline_metrics"].items()): + if metrics.get("count", 0) > 0: + print( + f" {pipeline:15s}: mean={metrics['mean']:.4f} p10={metrics['p10']:.4f} (n={metrics['count']})" + ) + + if args.output: + with open(args.output, "w") as f: + json.dump(results, f, indent=2, default=str) + print(f"\nFull results saved to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/dispatcher/heuristics/feature_engine.py b/dispatcher/heuristics/feature_engine.py new file mode 
# --- dispatcher/heuristics/feature_engine.py (new file) ---
#!/usr/bin/env python3
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT

"""
Feature engineering for CK Tile kernel performance prediction.

Provides a strict FeatureEngine interface with per-op subclasses.
All feature engines produce a consistent numpy array for LightGBM.
"""

import math
from abc import ABC, abstractmethod

import numpy as np
import pandas as pd


# Bytes per element for each supported data type.
DTYPE_BYTES = {
    "fp32": 4.0,
    "fp16": 2.0,
    "bf16": 2.0,
    "fp8": 1.0,
    "bf8": 1.0,
    "int8": 1.0,
    "int4": 0.5,
}

# Categorical encodings shared by the scalar and vectorized extraction paths.
LAYOUT_MAP = {"rcr": 0, "rrr": 1, "crr": 2, "ccr": 3}
PIPELINE_MAP = {"compv3": 0, "compv4": 1, "compv5": 2, "mem": 3, "preshufflev2": 4}
SCHEDULER_MAP = {"intrawave": 0, "interwave": 1}
EPILOGUE_MAP = {"default": 0, "cshuffle": 1}


class FeatureEngine(ABC):
    """Abstract base for per-op feature extraction."""

    @abstractmethod
    def get_feature_names(self) -> list[str]:
        """Ordered list of feature names matching the output array columns."""
        ...

    @abstractmethod
    def get_categorical_features(self) -> list[str]:
        """Feature names that should be treated as categorical by LightGBM."""
        ...

    @abstractmethod
    def extract(self, problem: dict, kernel: dict) -> np.ndarray:
        """Extract a single feature vector from a (problem, kernel) pair."""
        ...

    def extract_batch(self, df: pd.DataFrame) -> np.ndarray:
        """Batch extraction from a DataFrame.

        Default implementation calls :meth:`extract` row by row, treating each
        row dict as both the problem and kernel description. Subclasses should
        override with a vectorized version for speed.
        """
        names = self.get_feature_names()
        result = np.zeros((len(df), len(names)), dtype=np.float64)
        for i in range(len(df)):
            row = df.iloc[i]
            prob = row.to_dict()
            kern = row.to_dict()
            result[i] = self.extract(prob, kern)
        return result

    def get_parameter_space(self) -> dict[str, list]:
        """Valid discrete values for each kernel parameter (for surrogate search)."""
        return {}

    def get_constraints(self) -> list:
        """Multi-param constraint functions returning True if config is valid."""
        return []

    def validate_config(self, config: dict) -> bool:
        """Check all constraints. Returns True if the config is valid.

        A config is valid when every parameter it sets is in the declared
        parameter space AND every multi-parameter constraint passes.
        """
        ps = self.get_parameter_space()
        for k, valid_vals in ps.items():
            if k in config and config[k] not in valid_vals:
                return False
        for constraint in self.get_constraints():
            if not constraint(config):
                return False
        return True

    def project_to_valid(self, config: dict) -> dict:
        """Snap a config to the nearest valid discrete point.

        Numeric parameters snap to the closest allowed value; non-numeric
        parameters fall back to the first allowed value when invalid.
        """
        ps = self.get_parameter_space()
        result = dict(config)
        for k, valid_vals in ps.items():
            if k not in result:
                continue
            v = result[k]
            if isinstance(valid_vals[0], (int, float)):
                result[k] = min(valid_vals, key=lambda x: abs(x - v))
            elif v not in valid_vals:
                result[k] = valid_vals[0]
        return result


class GemmUniversalFeatureEngine(FeatureEngine):
    """Feature engine for gemm_universal kernels.

    Hardware defaults below describe the target GPU; they are emitted as
    constant features so one model can be trained across architectures.
    """

    def __init__(
        self,
        num_cus: int = 256,
        lds_capacity: int = 65536,
        max_clock_mhz: int = 2400,
        simds_per_cu: int = 4,
        shader_engines: int = 32,
        max_waves_per_cu: int = 32,
        wavefront_size: int = 64,
        l1_cache_kb: int = 32,
        l2_cache_kb: int = 4096,
        l3_cache_kb: int = 262144,
        num_xcd: int = 8,
    ):
        self._hw = {
            "num_cus": num_cus,
            "lds_capacity": lds_capacity,
            "max_clock_mhz": max_clock_mhz,
            "simds_per_cu": simds_per_cu,
            "shader_engines": shader_engines,
            "max_waves_per_cu": max_waves_per_cu,
            "wavefront_size": wavefront_size,
            "l1_cache_kb": l1_cache_kb,
            "l2_cache_kb": l2_cache_kb,
            "l3_cache_kb": l3_cache_kb,
            "num_xcd": num_xcd,
            "total_simds": num_cus * simds_per_cu,
        }

    def get_feature_names(self) -> list[str]:
        # Order here MUST match the column indices written by extract() and
        # extract_batch(); 72 features total.
        return [
            # Problem features
            "M",
            "N",
            "K",
            "split_k",
            "log2_M",
            "log2_N",
            "log2_K",
            "log2_MNK",
            "arithmetic_intensity",
            "aspect_ratio_mn",
            "aspect_ratio_mk",
            "aspect_ratio_nk",
            "layout",
            # Kernel features
            "tile_m",
            "tile_n",
            "tile_k",
            "warp_m",
            "warp_n",
            "warp_k",
            "warp_tile_m",
            "warp_tile_n",
            "warp_tile_k",
            "pipeline",
            "scheduler",
            "epilogue",
            "pad_m",
            "pad_n",
            "pad_k",
            "persistent",
            "num_warps",
            "tile_volume",
            "tile_mn",
            "lds_usage_estimate",
            "lds_usage_ratio",
            # Interaction features
            "num_tiles_m",
            "num_tiles_n",
            "num_tiles_k",
            "total_output_tiles",
            "tile_eff_m",
            "tile_eff_n",
            "tile_eff_k",
            "overall_tile_efficiency",
            "cu_utilization",
            # P0 FIX: Problem-to-tile ratio features
            "ratio_M_to_tile_m",
            "ratio_N_to_tile_n",
            "ratio_K_to_tile_k",
            "problem_smaller_than_tile_m",
            "problem_smaller_than_tile_n",
            "problem_smaller_than_tile_k",
            "any_dim_too_small",
            # P1 FIX: Padding requirement interaction features
            "needs_padding_m",
            "needs_padding_n",
            "needs_padding_k",
            "has_padding_when_needed_m",
            "has_padding_when_needed_n",
            "has_padding_when_needed_k",
            "missing_required_padding_m",
            "missing_required_padding_n",
            "missing_required_padding_k",
            "missing_any_required_padding",
            # Hardware features
            "hw_num_cus",
            "hw_simds_per_cu",
            "hw_total_simds",
            "hw_shader_engines",
            "hw_max_clock_mhz",
            "hw_max_waves_per_cu",
            "hw_wavefront_size",
            "hw_lds_capacity",
            "hw_l1_cache_kb",
            "hw_l2_cache_kb",
            "hw_l3_cache_kb",
            "hw_num_xcd",
        ]

    def get_categorical_features(self) -> list[str]:
        return ["layout", "pipeline", "scheduler", "epilogue"]

    def extract(self, problem: dict, kernel: dict) -> np.ndarray:
        """Scalar feature extraction for one (problem, kernel) pair.

        Accepts both lower- and upper-case M/N/K keys in ``problem``.
        """
        M = int(problem.get("m", problem.get("M", 0)))
        N = int(problem.get("n", problem.get("N", 0)))
        K = int(problem.get("k", problem.get("K", 0)))
        split_k = int(problem.get("split_k", 1))
        dtype = str(problem.get("dtype", "fp8"))
        bpe = DTYPE_BYTES.get(dtype, 1.0)

        log2_M = math.log2(max(M, 1))
        log2_N = math.log2(max(N, 1))
        log2_K = math.log2(max(K, 1))
        log2_MNK = math.log2(max(M * N * K, 1))

        # Arithmetic intensity: FLOPs per byte moved (A + B + C traffic).
        mem_bytes = (M * K + K * N + M * N) * bpe
        ai = (2.0 * M * N * K) / max(mem_bytes, 1)

        ar_mn = M / max(N, 1)
        ar_mk = M / max(K, 1)
        ar_nk = N / max(K, 1)

        layout_code = LAYOUT_MAP.get(str(problem.get("layout", "rcr")), 0)

        tile_m = int(kernel.get("tile_m", 128))
        tile_n = int(kernel.get("tile_n", 128))
        tile_k = int(kernel.get("tile_k", 64))
        warp_m = int(kernel.get("warp_m", 2))
        warp_n = int(kernel.get("warp_n", 2))
        warp_k = int(kernel.get("warp_k", 1))
        warp_tile_m = int(kernel.get("warp_tile_m", 32))
        warp_tile_n = int(kernel.get("warp_tile_n", 32))
        warp_tile_k = int(kernel.get("warp_tile_k", 16))

        # FIX: default missing pipeline to "compv3" (code 0). The vectorized
        # extract_batch maps a missing pipeline to 0 via fillna(0), and the
        # kernel-name parser also defaults to "compv3"; the previous "compv4"
        # default made the scalar and batch paths disagree on the same row.
        pipeline_code = PIPELINE_MAP.get(str(kernel.get("pipeline", "compv3")), 0)
        scheduler_code = SCHEDULER_MAP.get(str(kernel.get("scheduler", "intrawave")), 0)
        epilogue_code = EPILOGUE_MAP.get(str(kernel.get("epilogue", "cshuffle")), 0)

        pad_m = float(kernel.get("pad_m", False))
        pad_n = float(kernel.get("pad_n", False))
        pad_k = float(kernel.get("pad_k", False))
        persistent = float(kernel.get("persistent", False))

        num_warps = warp_m * warp_n * warp_k
        tile_volume = tile_m * tile_n * tile_k
        tile_mn = tile_m * tile_n

        # LDS estimate covers the A and B tile staging buffers; compv4-family
        # pipelines are modeled with a halved 32 KiB effective capacity.
        lds_est = (tile_m * tile_k + tile_n * tile_k) * bpe
        lds_cap = self._hw["lds_capacity"]
        if str(kernel.get("pipeline", "")).startswith("compv4"):
            lds_cap = 32768
        lds_ratio = lds_est / max(lds_cap, 1)

        num_tiles_m = math.ceil(M / max(tile_m, 1))
        num_tiles_n = math.ceil(N / max(tile_n, 1))
        num_tiles_k = math.ceil(K / max(tile_k, 1))
        total_output_tiles = num_tiles_m * num_tiles_n

        # Fraction of the last (partial) tile that is useful work; 1.0 when
        # the dimension divides evenly.
        rem_m = M % tile_m if tile_m > 0 else 0
        tile_eff_m = rem_m / tile_m if rem_m > 0 else 1.0
        rem_n = N % tile_n if tile_n > 0 else 0
        tile_eff_n = rem_n / tile_n if rem_n > 0 else 1.0
        rem_k = K % tile_k if tile_k > 0 else 0
        tile_eff_k = rem_k / tile_k if rem_k > 0 else 1.0
        overall_eff = tile_eff_m * tile_eff_n * tile_eff_k

        cu_util = total_output_tiles / max(self._hw["num_cus"], 1)

        # P0 FIX: Problem-to-tile ratio features (avoid oversized tiles for tiny problems)
        ratio_M_to_tile_m = M / max(tile_m, 1)
        ratio_N_to_tile_n = N / max(tile_n, 1)
        ratio_K_to_tile_k = K / max(tile_k, 1)

        # Binary features: is problem dimension smaller than tile?
        problem_smaller_than_tile_m = float(M < tile_m)
        problem_smaller_than_tile_n = float(N < tile_n)
        problem_smaller_than_tile_k = float(K < tile_k)
        any_dim_too_small = float((M < tile_m) or (N < tile_n) or (K < tile_k))

        # P1 FIX: Padding requirement features (does this kernel have padding when needed?)
        needs_padding_m = float(M % tile_m != 0) if tile_m > 0 else 0.0
        needs_padding_n = float(N % tile_n != 0) if tile_n > 0 else 0.0
        needs_padding_k = float(K % tile_k != 0) if tile_k > 0 else 0.0

        # Interaction features: kernel has padding capability when problem needs it
        has_padding_when_needed_m = float(needs_padding_m and pad_m)
        has_padding_when_needed_n = float(needs_padding_n and pad_n)
        has_padding_when_needed_k = float(needs_padding_k and pad_k)

        # Critical feature: missing required padding (kernel will likely fail)
        missing_required_padding_m = float(needs_padding_m and not pad_m)
        missing_required_padding_n = float(needs_padding_n and not pad_n)
        missing_required_padding_k = float(needs_padding_k and not pad_k)
        missing_any_required_padding = float(
            missing_required_padding_m
            or missing_required_padding_n
            or missing_required_padding_k
        )

        hw = self._hw
        return np.array(
            [
                M,
                N,
                K,
                split_k,
                log2_M,
                log2_N,
                log2_K,
                log2_MNK,
                ai,
                ar_mn,
                ar_mk,
                ar_nk,
                layout_code,
                tile_m,
                tile_n,
                tile_k,
                warp_m,
                warp_n,
                warp_k,
                warp_tile_m,
                warp_tile_n,
                warp_tile_k,
                pipeline_code,
                scheduler_code,
                epilogue_code,
                pad_m,
                pad_n,
                pad_k,
                persistent,
                num_warps,
                tile_volume,
                tile_mn,
                lds_est,
                lds_ratio,
                num_tiles_m,
                num_tiles_n,
                num_tiles_k,
                total_output_tiles,
                tile_eff_m,
                tile_eff_n,
                tile_eff_k,
                overall_eff,
                cu_util,
                # P0 FIX: New ratio and binary features
                ratio_M_to_tile_m,
                ratio_N_to_tile_n,
                ratio_K_to_tile_k,
                problem_smaller_than_tile_m,
                problem_smaller_than_tile_n,
                problem_smaller_than_tile_k,
                any_dim_too_small,
                # P1 FIX: Padding requirement interaction features
                needs_padding_m,
                needs_padding_n,
                needs_padding_k,
                has_padding_when_needed_m,
                has_padding_when_needed_n,
                has_padding_when_needed_k,
                missing_required_padding_m,
                missing_required_padding_n,
                missing_required_padding_k,
                missing_any_required_padding,
                hw["num_cus"],
                hw["simds_per_cu"],
                hw["total_simds"],
                hw["shader_engines"],
                hw["max_clock_mhz"],
                hw["max_waves_per_cu"],
                hw["wavefront_size"],
                hw["lds_capacity"],
                hw["l1_cache_kb"],
                hw["l2_cache_kb"],
                hw["l3_cache_kb"],
                hw["num_xcd"],
            ],
            dtype=np.float64,
        )

    def extract_batch(self, df: pd.DataFrame) -> np.ndarray:
        """Vectorized batch extraction -- much faster than row-by-row.

        Column indices match get_feature_names() exactly; missing kernel
        columns fall back to the same defaults as extract().
        """
        n = len(df)
        names = self.get_feature_names()
        result = np.zeros((n, len(names)), dtype=np.float64)

        M = df["m"].values.astype(np.float64)
        N = df["n"].values.astype(np.float64)
        K = df["k"].values.astype(np.float64)
        split_k = df["split_k"].fillna(1).values.astype(np.float64)

        dtype_col = df["dtype"].fillna("fp8")
        bpe = dtype_col.map(DTYPE_BYTES).fillna(1.0).values

        result[:, 0] = M
        result[:, 1] = N
        result[:, 2] = K
        result[:, 3] = split_k
        result[:, 4] = np.log2(np.maximum(M, 1))
        result[:, 5] = np.log2(np.maximum(N, 1))
        result[:, 6] = np.log2(np.maximum(K, 1))
        result[:, 7] = np.log2(np.maximum(M * N * K, 1))

        mem = (M * K + K * N + M * N) * bpe
        result[:, 8] = (2.0 * M * N * K) / np.maximum(mem, 1)
        result[:, 9] = M / np.maximum(N, 1)
        result[:, 10] = M / np.maximum(K, 1)
        result[:, 11] = N / np.maximum(K, 1)

        result[:, 12] = df["layout"].map(LAYOUT_MAP).fillna(0).values

        tile_m = df["tile_m"].fillna(128).values.astype(np.float64)
        tile_n = df["tile_n"].fillna(128).values.astype(np.float64)
        tile_k = df["tile_k"].fillna(64).values.astype(np.float64)
        warp_m = df["warp_m"].fillna(2).values.astype(np.float64)
        warp_n = df["warp_n"].fillna(2).values.astype(np.float64)
        warp_k = df["warp_k"].fillna(1).values.astype(np.float64)
        warp_tile_m = df["warp_tile_m"].fillna(32).values.astype(np.float64)
        warp_tile_n = df["warp_tile_n"].fillna(32).values.astype(np.float64)
        warp_tile_k = df["warp_tile_k"].fillna(16).values.astype(np.float64)

        result[:, 13] = tile_m
        result[:, 14] = tile_n
        result[:, 15] = tile_k
        result[:, 16] = warp_m
        result[:, 17] = warp_n
        result[:, 18] = warp_k
        result[:, 19] = warp_tile_m
        result[:, 20] = warp_tile_n
        result[:, 21] = warp_tile_k

        # Missing/unknown categoricals map to code 0 (compv3/intrawave/default).
        result[:, 22] = df["pipeline"].map(PIPELINE_MAP).fillna(0).values
        result[:, 23] = df["scheduler"].map(SCHEDULER_MAP).fillna(0).values
        result[:, 24] = df["epilogue"].map(EPILOGUE_MAP).fillna(0).values

        result[:, 25] = df["pad_m"].fillna(False).astype(float).values
        result[:, 26] = df["pad_n"].fillna(False).astype(float).values
        result[:, 27] = df["pad_k"].fillna(False).astype(float).values
        result[:, 28] = df["persistent"].fillna(False).astype(float).values

        num_warps = warp_m * warp_n * warp_k
        result[:, 29] = num_warps
        result[:, 30] = tile_m * tile_n * tile_k
        result[:, 31] = tile_m * tile_n

        lds_est = (tile_m * tile_k + tile_n * tile_k) * bpe
        result[:, 32] = lds_est
        lds_cap = np.full(n, self._hw["lds_capacity"], dtype=np.float64)
        is_compv4 = df["pipeline"].fillna("").str.startswith("compv4")
        lds_cap[is_compv4] = 32768
        result[:, 33] = lds_est / np.maximum(lds_cap, 1)

        ntm = np.ceil(M / np.maximum(tile_m, 1))
        ntn = np.ceil(N / np.maximum(tile_n, 1))
        ntk = np.ceil(K / np.maximum(tile_k, 1))
        result[:, 34] = ntm
        result[:, 35] = ntn
        result[:, 36] = ntk
        result[:, 37] = ntm * ntn

        rem_m = np.mod(M, np.maximum(tile_m, 1))
        result[:, 38] = np.where(rem_m > 0, rem_m / tile_m, 1.0)
        rem_n = np.mod(N, np.maximum(tile_n, 1))
        result[:, 39] = np.where(rem_n > 0, rem_n / tile_n, 1.0)
        rem_k = np.mod(K, np.maximum(tile_k, 1))
        result[:, 40] = np.where(rem_k > 0, rem_k / tile_k, 1.0)
        result[:, 41] = result[:, 38] * result[:, 39] * result[:, 40]

        result[:, 42] = (ntm * ntn) / max(self._hw["num_cus"], 1)

        # P0 FIX: Problem-to-tile ratio features
        result[:, 43] = M / np.maximum(tile_m, 1)  # ratio_M_to_tile_m
        result[:, 44] = N / np.maximum(tile_n, 1)  # ratio_N_to_tile_n
        result[:, 45] = K / np.maximum(tile_k, 1)  # ratio_K_to_tile_k

        # Binary features: is problem smaller than tile?
        result[:, 46] = (M < tile_m).astype(float)  # problem_smaller_than_tile_m
        result[:, 47] = (N < tile_n).astype(float)  # problem_smaller_than_tile_n
        result[:, 48] = (K < tile_k).astype(float)  # problem_smaller_than_tile_k
        result[:, 49] = ((M < tile_m) | (N < tile_n) | (K < tile_k)).astype(
            float
        )  # any_dim_too_small

        # P1 FIX: Padding requirement features
        pad_m_bool = df["pad_m"].fillna(False).astype(bool).values
        pad_n_bool = df["pad_n"].fillna(False).astype(bool).values
        pad_k_bool = df["pad_k"].fillna(False).astype(bool).values

        needs_padding_m = (np.mod(M, np.maximum(tile_m, 1)) != 0)
        needs_padding_n = (np.mod(N, np.maximum(tile_n, 1)) != 0)
        needs_padding_k = (np.mod(K, np.maximum(tile_k, 1)) != 0)

        result[:, 50] = needs_padding_m.astype(float)
        result[:, 51] = needs_padding_n.astype(float)
        result[:, 52] = needs_padding_k.astype(float)

        # Interaction features: kernel has padding when problem needs it
        result[:, 53] = (needs_padding_m & pad_m_bool).astype(float)  # has_padding_when_needed_m
        result[:, 54] = (needs_padding_n & pad_n_bool).astype(float)  # has_padding_when_needed_n
        result[:, 55] = (needs_padding_k & pad_k_bool).astype(float)  # has_padding_when_needed_k

        # Critical feature: missing required padding
        result[:, 56] = (needs_padding_m & ~pad_m_bool).astype(float)  # missing_required_padding_m
        result[:, 57] = (needs_padding_n & ~pad_n_bool).astype(float)  # missing_required_padding_n
        result[:, 58] = (needs_padding_k & ~pad_k_bool).astype(float)  # missing_required_padding_k
        result[:, 59] = ((needs_padding_m & ~pad_m_bool) | (needs_padding_n & ~pad_n_bool) | (needs_padding_k & ~pad_k_bool)).astype(float)  # missing_any_required_padding

        # Hardware profile features
        hw = self._hw
        result[:, 60] = hw["num_cus"]
        result[:, 61] = hw["simds_per_cu"]
        result[:, 62] = hw["total_simds"]
        result[:, 63] = hw["shader_engines"]
        result[:, 64] = hw["max_clock_mhz"]
        result[:, 65] = hw["max_waves_per_cu"]
        result[:, 66] = hw["wavefront_size"]
        result[:, 67] = hw["lds_capacity"]
        result[:, 68] = hw["l1_cache_kb"]
        result[:, 69] = hw["l2_cache_kb"]
        result[:, 70] = hw["l3_cache_kb"]
        result[:, 71] = hw["num_xcd"]

        return result

    def get_parameter_space(self) -> dict[str, list]:
        return {
            "tile_m": [32, 64, 128, 192, 256],
            "tile_n": [32, 64, 128, 192, 256],
            "tile_k": [32, 64, 128, 256],
            "warp_m": [1, 2, 4],
            "warp_n": [1, 2, 4],
            "warp_k": [1],
            "warp_tile_m": [4, 16, 32, 64],
            "warp_tile_n": [4, 16, 32, 64],
            "warp_tile_k": [8, 16, 32, 64, 128],
            "pipeline": list(PIPELINE_MAP.keys()),
            "scheduler": list(SCHEDULER_MAP.keys()),
            "epilogue": list(EPILOGUE_MAP.keys()),
            "pad_m": [True, False],
            "pad_n": [True, False],
            "pad_k": [True, False],
            "persistent": [True, False],
        }

    def get_constraints(self) -> list:
        lds_cap = self._hw["lds_capacity"]

        def _lds_constraint(cfg):
            # A+B tile staging must fit in LDS (compv4 modeled at 32 KiB).
            tm = cfg.get("tile_m", 128)
            tn = cfg.get("tile_n", 128)
            tk = cfg.get("tile_k", 64)
            bpe = 1.0  # fp8 default
            est = (tm * tk + tn * tk) * bpe
            cap = (
                32768 if str(cfg.get("pipeline", "")).startswith("compv4") else lds_cap
            )
            return est <= cap

        def _warp_constraint(cfg):
            # Total warps per workgroup must be 2, 4, or 8.
            wm = cfg.get("warp_m", 2)
            wn = cfg.get("warp_n", 2)
            wk = cfg.get("warp_k", 1)
            return (wm * wn * wk) in [2, 4, 8]

        return [_lds_constraint, _warp_constraint]


# --- dispatcher/heuristics/generate_benchmark_data.py (new file) ---
#!/usr/bin/env python3
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT

"""
GEMM Universal Benchmark Data Generation Script

This script generates training data for ML-based kernel selection heuristics by:
1. Reading kernel configurations from the tile engine
2. Building benchmark executables (in parallel)
3. Running benchmarks across multiple problem sizes
4. Outputting performance data in JSON format

Usage:
    python generate_benchmark_data.py \
        --build_dir /tmp/build \
        --output_dir /tmp/benchmark_data \
        --dtype fp16 \
        --layout rcr \
        --num_build_jobs 4 \
        --num_benchmark_jobs 1

Requirements:
    - ROCm-capable GPU
    - CK tile engine built with CMake
"""

import argparse
import json
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Optional, Tuple
import re
@dataclass
class KernelConfig:
    """Represents a single kernel configuration.

    Fields mirror the components encoded in a gemm_universal kernel name;
    see :meth:`from_kernel_name` for the encoding.
    """

    name: str
    dtype: str
    layout: str
    pipeline: str
    epilogue: str
    scheduler: str
    pad_m: bool
    pad_n: bool
    pad_k: bool
    persistent: bool
    tile_m: int
    tile_n: int
    tile_k: int
    warp_m: int
    warp_n: int
    warp_k: int
    warp_tile_m: int
    warp_tile_n: int
    warp_tile_k: int

    @classmethod
    def from_kernel_name(cls, name: str, dtype: str, layout: str) -> "KernelConfig":
        """Parse kernel name to extract configuration.

        Name format:
          gemm_universal_{dtype}_{layout}_{pipeline}_{epilogue}_{scheduler}_
          {padM}_{padN}_{padK}_{persistent}_{tile}_{warps}_{warp_tile}
        where the trailing three fields are 'x'-separated triples, e.g.
          128x128x64_2x2x1_32x32x16

        Raises IndexError/ValueError on a name that does not match the
        expected encoding.
        """
        # FIX: dropped unused local `parts = name.split("_")` from the
        # original — it was computed and never read.
        prefix = f"gemm_universal_{dtype}_{layout}_"
        trait_and_tile = name[len(prefix) :]
        trait_parts = trait_and_tile.split("_")

        pipeline = trait_parts[0]
        epilogue = trait_parts[1]
        scheduler = trait_parts[2]
        pad_m = trait_parts[3] == "True"
        pad_n = trait_parts[4] == "True"
        pad_k = trait_parts[5] == "True"
        persistent = trait_parts[6] == "True"

        # Parse tile config
        tile_dims = trait_parts[7].split("x")
        warp_dims = trait_parts[8].split("x")
        warp_tile_dims = trait_parts[9].split("x")

        return cls(
            name=name,
            dtype=dtype,
            layout=layout,
            pipeline=pipeline,
            epilogue=epilogue,
            scheduler=scheduler,
            pad_m=pad_m,
            pad_n=pad_n,
            pad_k=pad_k,
            persistent=persistent,
            tile_m=int(tile_dims[0]),
            tile_n=int(tile_dims[1]),
            tile_k=int(tile_dims[2]),
            warp_m=int(warp_dims[0]),
            warp_n=int(warp_dims[1]),
            warp_k=int(warp_dims[2]),
            warp_tile_m=int(warp_tile_dims[0]),
            warp_tile_n=int(warp_tile_dims[1]),
            warp_tile_k=int(warp_tile_dims[2]),
        )


@dataclass
class BenchmarkResult:
    """Result of a single benchmark run."""

    kernel_name: str
    m: int
    n: int
    k: int
    avg_time_ms: float
    tflops: float
    is_valid: bool
    error: Optional[str] = None


@dataclass
class ProblemSize:
    """GEMM problem dimensions."""

    m: int
    n: int
    k: int


def get_problem_sizes() -> List[ProblemSize]:
    """
    Generate diverse problem sizes for benchmarking.

    Includes:
        - Square matrices (powers of 2)
        - Rectangular matrices (common in ML)
        - LLM-specific sizes (attention, MLP)
        - Edge cases (small, very large)

    Returns a de-duplicated, order-preserving list.
    """
    sizes = []

    # Powers of 2 (square)
    for p in [6, 7, 8, 9, 10, 11, 12, 13]:  # 64 to 8192
        dim = 2**p
        sizes.append(ProblemSize(dim, dim, dim))

    # Common ML sizes (batch x hidden)
    ml_sizes = [
        (1, 4096, 4096),  # Single token inference
        (8, 4096, 4096),  # Small batch
        (32, 4096, 4096),  # Medium batch
        (128, 4096, 4096),  # Large batch
        (1, 4096, 11008),  # LLaMA MLP up-projection
        (1, 11008, 4096),  # LLaMA MLP down-projection
        (32, 4096, 11008),
        (32, 11008, 4096),
        (1, 8192, 8192),  # Large model
        (32, 8192, 8192),
        (1, 8192, 28672),  # LLaMA-70B MLP
        (32, 8192, 28672),
    ]
    for m, n, k in ml_sizes:
        sizes.append(ProblemSize(m, n, k))

    # Rectangular matrices
    rect_sizes = [
        (1024, 4096, 1024),
        (4096, 1024, 4096),
        (2048, 8192, 2048),
        (256, 256, 8192),  # Tall K
        (8192, 8192, 256),  # Short K
    ]
    for m, n, k in rect_sizes:
        sizes.append(ProblemSize(m, n, k))

    # Remove duplicates while preserving order
    seen = set()
    unique_sizes = []
    for s in sizes:
        key = (s.m, s.n, s.k)
        if key not in seen:
            seen.add(key)
            unique_sizes.append(s)

    return unique_sizes


def load_kernel_list(build_dir: Path, dtype: str, layout: str) -> List[KernelConfig]:
    """Load kernel configurations from the tile engine build.

    Reads the generated gemm_universal_kernel_list.txt; each non-empty line
    is '|'-separated with the kernel name in the first field.

    Raises FileNotFoundError when the list file is absent.
    """
    kernel_list_path = (
        build_dir
        / "tile_engine"
        / "ops"
        / "gemm"
        / "gemm_universal"
        / dtype
        / layout
        / "gemm_universal_kernel_list.txt"
    )

    if not kernel_list_path.exists():
        raise FileNotFoundError(f"Kernel list not found: {kernel_list_path}")

    kernels = []
    with open(kernel_list_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Format: kernel_name|tile_config|trait_combo
            parts = line.split("|")
            kernel_name = parts[0]
            kernels.append(KernelConfig.from_kernel_name(kernel_name, dtype, layout))

    return kernels


def build_kernel(build_dir: Path, kernel: KernelConfig) -> Tuple[str, bool, str]:
    """
    Build a single kernel benchmark executable via ninja.

    Returns: (kernel_name, success, error_message) — stderr is truncated to
    500 chars on failure; a 5-minute timeout is reported as "Build timeout".
    """
    target_name = f"benchmark_{kernel.name}"

    try:
        result = subprocess.run(
            ["ninja", "-j1", target_name],
            cwd=build_dir,
            capture_output=True,
            text=True,
            timeout=300,  # 5 minute timeout
        )

        if result.returncode != 0:
            return (kernel.name, False, result.stderr[:500])

        return (kernel.name, True, "")
    except subprocess.TimeoutExpired:
        return (kernel.name, False, "Build timeout")
    except Exception as e:
        return (kernel.name, False, str(e))


def run_benchmark(
    build_dir: Path,
    kernel: KernelConfig,
    problem: ProblemSize,
    warmup: int = 10,
    repeat: int = 50,
) -> BenchmarkResult:
    """
    Run benchmark for a single kernel and problem size.

    Never raises: every failure mode (missing binary, non-zero exit, timeout,
    unparseable output) is reported via BenchmarkResult.is_valid/error.
    """
    exe_path = build_dir / "bin" / f"benchmark_{kernel.name}"

    if not exe_path.exists():
        return BenchmarkResult(
            kernel_name=kernel.name,
            m=problem.m,
            n=problem.n,
            k=problem.k,
            avg_time_ms=0,
            tflops=0,
            is_valid=False,
            error="Executable not found",
        )

    try:
        result = subprocess.run(
            [
                str(exe_path),
                f"-m={problem.m}",
                f"-n={problem.n}",
                f"-k={problem.k}",
                f"-warmup={warmup}",
                f"-repeat={repeat}",
                "-verify=0",
                "-json_output=true",
            ],
            capture_output=True,
            text=True,
            timeout=120,
        )

        if result.returncode != 0:
            # Try to parse error
            error = result.stderr[:200] if result.stderr else result.stdout[:200]
            return BenchmarkResult(
                kernel_name=kernel.name,
                m=problem.m,
                n=problem.n,
                k=problem.k,
                avg_time_ms=0,
                tflops=0,
                is_valid=False,
                error=error,
            )

        # Parse JSON output
        output = result.stdout.strip()

        # Try to find JSON in output
        json_match = re.search(r"\{.*\}", output, re.DOTALL)
        if json_match:
            data = json.loads(json_match.group())
            # Extract from nested perf_result object
            perf = data.get("perf_result", {})
            avg_time_ms = perf.get("latency(ms)", 0)
            tflops = perf.get("tflops(TFlops)", 0)

            return BenchmarkResult(
                kernel_name=kernel.name,
                m=problem.m,
                n=problem.n,
                k=problem.k,
                avg_time_ms=avg_time_ms,
                tflops=tflops,
                is_valid=True,
            )
        else:
            # Parse from text output
            # Look for patterns like "avg_time: X ms" or "tflops: Y"
            avg_time = 0.0
            tflops = 0.0

            time_match = re.search(
                r"(?:avg[_\s]?time|latency)[:\s]+(\d+\.?\d*)\s*(?:ms)?", output, re.I
            )
            if time_match:
                avg_time = float(time_match.group(1))

            tflops_match = re.search(r"tflops[:\s]+(\d+\.?\d*)", output, re.I)
            if tflops_match:
                tflops = float(tflops_match.group(1))

            # Calculate TFLOPs if not provided
            if tflops == 0 and avg_time > 0:
                flops = 2.0 * problem.m * problem.n * problem.k
                tflops = flops / (avg_time * 1e-3) / 1e12

            return BenchmarkResult(
                kernel_name=kernel.name,
                m=problem.m,
                n=problem.n,
                k=problem.k,
                avg_time_ms=avg_time,
                tflops=tflops,
                is_valid=avg_time > 0,
                error=None if avg_time > 0 else "Could not parse output",
            )

    except subprocess.TimeoutExpired:
        return BenchmarkResult(
            kernel_name=kernel.name,
            m=problem.m,
            n=problem.n,
            k=problem.k,
            avg_time_ms=0,
            tflops=0,
            is_valid=False,
            error="Benchmark timeout",
        )
    except Exception as e:
        return BenchmarkResult(
            kernel_name=kernel.name,
            m=problem.m,
            n=problem.n,
            k=problem.k,
            avg_time_ms=0,
            tflops=0,
            is_valid=False,
            error=str(e),
        )


# NOTE(review): main() continues beyond this chunk of the file and is not
# reproduced here; see the following chunk for its argparse setup and body.
) + parser.add_argument( + "--layout", + type=str, + default="rcr", + choices=["rcr", "rrr", "crr", "ccr"], + help="Matrix layout to benchmark", + ) + parser.add_argument( + "--num_build_jobs", type=int, default=4, help="Number of parallel build jobs" + ) + parser.add_argument( + "--num_benchmark_jobs", + type=int, + default=1, + help="Number of parallel benchmark jobs (use 1 for accurate timing)", + ) + parser.add_argument( + "--max_kernels", + type=int, + default=None, + help="Maximum number of kernels to benchmark (for testing)", + ) + parser.add_argument( + "--skip_build", + action="store_true", + help="Skip building and only run benchmarks", + ) + parser.add_argument( + "--warmup", type=int, default=10, help="Number of warmup iterations" + ) + parser.add_argument( + "--repeat", type=int, default=50, help="Number of benchmark iterations" + ) + + args = parser.parse_args() + + build_dir = Path(args.build_dir) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Load kernel configurations + print(f"Loading kernel list for {args.dtype}/{args.layout}...") + kernels = load_kernel_list(build_dir, args.dtype, args.layout) + print(f"Found {len(kernels)} kernel configurations") + + if args.max_kernels: + kernels = kernels[: args.max_kernels] + print(f"Limiting to {len(kernels)} kernels") + + # Build kernels + if not args.skip_build: + print( + f"\nBuilding {len(kernels)} kernels with {args.num_build_jobs} parallel jobs..." 
+ ) + build_results = {"success": 0, "failed": 0, "failed_kernels": []} + + with ProcessPoolExecutor(max_workers=args.num_build_jobs) as executor: + futures = {executor.submit(build_kernel, build_dir, k): k for k in kernels} + + for i, future in enumerate(as_completed(futures)): + kernel_name, success, error = future.result() + if success: + build_results["success"] += 1 + else: + build_results["failed"] += 1 + build_results["failed_kernels"].append( + {"name": kernel_name, "error": error} + ) + + if (i + 1) % 10 == 0: + print( + f" Built {i + 1}/{len(kernels)} ({build_results['success']} success, {build_results['failed']} failed)" + ) + + print( + f"\nBuild complete: {build_results['success']} success, {build_results['failed']} failed" + ) + + # Save build results + with open(output_dir / "build_results.json", "w") as f: + json.dump(build_results, f, indent=2) + + # Get problem sizes + problem_sizes = get_problem_sizes() + print(f"\nBenchmarking {len(problem_sizes)} problem sizes...") + + # Run benchmarks + all_results = [] + total_benchmarks = len(kernels) * len(problem_sizes) + completed = 0 + + print(f"Total benchmarks to run: {total_benchmarks}") + + for kernel in kernels: + kernel_results = { + "kernel_config": asdict(kernel), + "benchmarks": [], + } + + for problem in problem_sizes: + result = run_benchmark( + build_dir, + kernel, + problem, + warmup=args.warmup, + repeat=args.repeat, + ) + kernel_results["benchmarks"].append(asdict(result)) + completed += 1 + + if completed % 100 == 0: + print(f" Progress: {completed}/{total_benchmarks} benchmarks complete") + + all_results.append(kernel_results) + + # Save intermediate results + intermediate_file = ( + output_dir / f"benchmark_results_{args.dtype}_{args.layout}_partial.json" + ) + with open(intermediate_file, "w") as f: + json.dump(all_results, f, indent=2) + + # Save final results + final_file = output_dir / f"benchmark_results_{args.dtype}_{args.layout}.json" + with open(final_file, "w") as f: + 
json.dump( + { + "metadata": { + "dtype": args.dtype, + "layout": args.layout, + "num_kernels": len(kernels), + "num_problems": len(problem_sizes), + "warmup": args.warmup, + "repeat": args.repeat, + "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), + }, + "problem_sizes": [asdict(p) for p in problem_sizes], + "results": all_results, + }, + f, + indent=2, + ) + + print(f"\nResults saved to {final_file}") + + # Print summary + valid_count = sum( + 1 for kr in all_results for br in kr["benchmarks"] if br["is_valid"] + ) + print(f"Valid benchmarks: {valid_count}/{total_benchmarks}") + + +if __name__ == "__main__": + main() diff --git a/dispatcher/heuristics/generate_edge_dims.py b/dispatcher/heuristics/generate_edge_dims.py new file mode 100644 index 0000000000..f5d243a5a9 --- /dev/null +++ b/dispatcher/heuristics/generate_edge_dims.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Supplementary edge-case benchmark generator for N=1 and K=1 dimensions. + +These shapes represent vector-matrix multiply (N=1), rank-1 updates (K=1), +and other degenerate GEMM cases that stress tile efficiency and padding logic. 
+""" + +import json +import subprocess +import sys +from pathlib import Path + + +def generate_edge_shapes(): + """Generate shapes with N=1, K=1, and other single-dimension edge cases.""" + shapes = set() + + # --- N=1: vector-matrix multiply / single output column --- + for m in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]: + for k in [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 7168, 8192]: + shapes.add((m, 1, k)) + + # --- K=1: rank-1 update / outer product --- + for m in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]: + for n in [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 7168, 8192]: + shapes.add((m, n, 1)) + + # --- M=1, N=1: dot product --- + for k in [1, 16, 64, 256, 1024, 4096, 8192]: + shapes.add((1, 1, k)) + + # --- M=1, K=1: scalar-vector --- + for n in [1, 16, 64, 256, 1024, 4096, 8192]: + shapes.add((1, n, 1)) + + # --- N=1, K=1: scalar-vector --- + for m in [1, 16, 64, 256, 1024, 4096, 8192]: + shapes.add((m, 1, 1)) + + # --- All ones: 1x1x1 --- + shapes.add((1, 1, 1)) + + # --- Small N (2-16) --- + for m in [64, 256, 1024, 4096]: + for n in [2, 3, 4, 7, 8, 15, 16]: + for k in [64, 256, 1024, 4096]: + shapes.add((m, n, k)) + + # --- Small K (2-16) --- + for m in [64, 256, 1024, 4096]: + for n in [64, 256, 1024, 4096]: + for k in [2, 3, 4, 7, 8, 15, 16]: + shapes.add((m, n, k)) + + return sorted(shapes) + + +def run_shapes(bin_dir, shapes, out_file, warmup=3, repeat=10): + """Run all kernels against shapes, writing streaming log.""" + executables = sorted(Path(bin_dir).glob("benchmark_gemm_universal_fp8_rcr_*")) + if not executables: + print(f"ERROR: No executables found in {bin_dir}", file=sys.stderr) + return 0 + + total = 0 + for idx, (m, n, k) in enumerate(shapes): + out_file.write("\n========================================\n") + out_file.write(f"Shape {idx + 1}: M={m} N={n} K={k} dtype=fp8 layout=rcr\n") + out_file.write("========================================\n") + out_file.write(f"Found 
{len(executables)} kernels\n") + out_file.flush() + + for exe in executables: + try: + result = subprocess.run( + [ + str(exe), + f"-m={m}", + f"-n={n}", + f"-k={k}", + f"-warmup={warmup}", + f"-repeat={repeat}", + "-verify=0", + ], + capture_output=True, + text=True, + timeout=60, + ) + output = result.stdout + json_start = output.find("{") + json_end = output.rfind("}") + 1 + if json_start >= 0 and json_end > json_start: + json_block = output[json_start:json_end] + try: + json.loads(json_block) + out_file.write(json_block + "\n") + total += 1 + except json.JSONDecodeError: + pass + except (subprocess.TimeoutExpired, Exception): + pass + + out_file.flush() + print( + f" Shape {idx + 1}/{len(shapes)}: M={m} N={n} K={k}", + file=sys.stderr, + flush=True, + ) + + return total + + +if __name__ == "__main__": + bin_dir = "/workspace/ck_tile/bin" + out_dir = Path("data/edge_dims") + out_dir.mkdir(parents=True, exist_ok=True) + + shapes = generate_edge_shapes() + print(f"Generated {len(shapes)} edge-case shapes", file=sys.stderr, flush=True) + + n1_count = sum(1 for m, n, k in shapes if n == 1) + k1_count = sum(1 for m, n, k in shapes if k == 1) + both1 = sum(1 for m, n, k in shapes if n == 1 and k == 1) + small_n = sum(1 for m, n, k in shapes if 2 <= n <= 16) + small_k = sum(1 for m, n, k in shapes if 2 <= k <= 16) + print( + f" N=1: {n1_count}, K=1: {k1_count}, both=1: {both1}", + file=sys.stderr, + flush=True, + ) + print( + f" Small N(2-16): {small_n}, Small K(2-16): {small_k}", + file=sys.stderr, + flush=True, + ) + + batch_size = 25 + total = 0 + batch_idx = 0 + for i in range(0, len(shapes), batch_size): + batch = shapes[i : i + batch_size] + batch_idx += 1 + out_path = out_dir / f"edge_dims_batch_{batch_idx:03d}.log" + print( + f"\nBatch {batch_idx}: shapes {i + 1}-{i + len(batch)} -> {out_path}", + file=sys.stderr, + flush=True, + ) + + with open(out_path, "w") as f: + f.write(f"CK Tile Edge Dims Benchmark Batch {batch_idx}\n") + f.write("GPU ID: 
0\nImplementation: gemm_universal\n\n") + count = run_shapes(bin_dir, batch, f, warmup=3, repeat=10) + total += count + + print(f" Batch {batch_idx} done: {count} results", file=sys.stderr, flush=True) + + print( + f"\nTotal: {total} benchmarks across {len(shapes)} shapes", + file=sys.stderr, + flush=True, + ) diff --git a/dispatcher/heuristics/generate_wide_coverage.py b/dispatcher/heuristics/generate_wide_coverage.py new file mode 100644 index 0000000000..e8e8116946 --- /dev/null +++ b/dispatcher/heuristics/generate_wide_coverage.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Wide-coverage benchmark data generator. + +Generates benchmark results for hundreds of diverse GEMM shapes across all +corner cases: skinny M, tall N, deep K, M=1, prime dimensions, power-of-2, +LLM inference shapes, training shapes, and edge cases. + +Runs all 4608 kernels in /workspace/ck_tile/bin/ against each shape and +writes streaming log output parseable by data_pipeline.py. + +Usage: + python3 generate_wide_coverage.py --bin_dir /workspace/ck_tile/bin \ + --out_dir data/ --batch_size 20 --warmup 3 --repeat 10 +""" + +import argparse +import json +import subprocess +import sys +from pathlib import Path + + +def generate_shape_list(): + """Generate a comprehensive list of (M, N, K) shapes covering all corner cases. + + Categories: + 1. M=1 (single token inference) -- the hardest case + 2. Tiny M (2-16) -- small batch inference + 3. Small M (32-128) -- medium batch + 4. Medium M (256-2048) -- large batch / training + 5. Large M (4096-20480) -- very large batch + 6. Square shapes (powers of 2) + 7. Skinny M, tall N (M << N) + 8. Tall M, skinny N (M >> N) + 9. Deep K (K >> M, N) -- compute-bound + 10. Shallow K (K << M, N) -- memory-bound + 11. Prime dimensions -- worst-case for tiling + 12. LLM-specific shapes (DeepSeek, LLaMA, etc.) + 13. 
Non-power-of-2 common sizes + """ + shapes = set() + + # --- 1. M=1 (single token) across various N, K --- + for n in [512, 1024, 1536, 2048, 3072, 4096, 4608, 7168, 8192, 11008, 14336, 28672]: + for k in [256, 512, 1024, 1536, 2048, 2304, 4096, 7168, 8192]: + shapes.add((1, n, k)) + + # --- 2. Tiny M (2-16) --- + for m in [2, 4, 8, 16]: + for n in [512, 1536, 4096, 7168]: + for k in [256, 1024, 4096, 7168]: + shapes.add((m, n, k)) + + # --- 3. Small M (32-128) --- + for m in [32, 48, 64, 96, 128]: + for n in [512, 1536, 4096, 7168, 8192]: + for k in [256, 512, 2048, 4096, 7168]: + shapes.add((m, n, k)) + + # --- 4. Medium M (256-2048) --- + for m in [256, 384, 512, 768, 1024, 1536, 2048]: + for n in [512, 1536, 4096, 7168]: + for k in [256, 1024, 2048, 4096, 7168]: + shapes.add((m, n, k)) + + # --- 5. Large M (4096-20480) --- + for m in [4096, 8192, 12288, 16384, 20480]: + for n in [512, 1536, 4096, 7168]: + for k in [256, 1024, 2048, 7168]: + shapes.add((m, n, k)) + + # --- 6. Square shapes (powers of 2) --- + for p in range(5, 14): # 32 to 8192 + d = 2**p + shapes.add((d, d, d)) + + # --- 7. Skinny M, tall N --- + for m in [1, 4, 16, 64]: + for n in [8192, 16384, 28672]: + for k in [1024, 4096, 8192]: + shapes.add((m, n, k)) + + # --- 8. Tall M, skinny N --- + for m in [4096, 8192, 16384]: + for n in [32, 64, 128, 256]: + for k in [1024, 4096]: + shapes.add((m, n, k)) + + # --- 9. Deep K (K >> M, N) --- + for m in [16, 64, 256]: + for n in [16, 64, 256]: + for k in [4096, 8192, 16384, 32768]: + shapes.add((m, n, k)) + + # --- 10. Shallow K (K << M, N) --- + for m in [1024, 4096, 8192]: + for n in [1024, 4096, 8192]: + for k in [16, 32, 64, 128]: + shapes.add((m, n, k)) + + # --- 11. Prime dimensions --- + primes = [17, 31, 37, 127, 251, 509, 1021, 2039, 4093] + for p in primes: + shapes.add((p, p, p)) + for p in primes[:5]: + shapes.add((p, 4096, 4096)) + shapes.add((4096, p, 4096)) + shapes.add((4096, 4096, p)) + + # --- 12. 
LLM-specific shapes --- + llm_shapes = [ + # DeepSeek MoE + (1, 1536, 7168), + (1, 4608, 7168), + (1, 7168, 2048), + (1, 7168, 2304), + (1, 7168, 256), + (1, 576, 7168), + (1, 512, 7168), + (1, 3072, 1536), + # LLaMA-7B + (1, 4096, 4096), + (32, 4096, 4096), + (128, 4096, 4096), + (1, 4096, 11008), + (32, 4096, 11008), + (1, 11008, 4096), + (32, 11008, 4096), + # LLaMA-70B + (1, 8192, 8192), + (32, 8192, 8192), + (128, 8192, 8192), + (1, 8192, 28672), + (32, 8192, 28672), + (1, 28672, 8192), + # GPT-style attention + (128, 128, 64), + (128, 128, 128), + (256, 256, 64), + (512, 512, 64), + (1024, 1024, 64), + (2048, 2048, 64), + ] + for s in llm_shapes: + shapes.add(s) + + # --- 13. Non-power-of-2 common sizes --- + for m in [48, 96, 192, 384, 576, 768, 1152, 1536, 2304, 3072, 4608, 6144]: + shapes.add((m, m, m)) + shapes.add((m, 4096, 4096)) + + sorted_shapes = sorted(shapes) + return sorted_shapes + + +def run_shape_batch(bin_dir, shapes, out_file, warmup=3, repeat=10): + """Run all kernels against a batch of shapes, writing streaming log output.""" + executables = sorted(Path(bin_dir).glob("benchmark_gemm_universal_fp8_rcr_*")) + if not executables: + print(f"ERROR: No executables found in {bin_dir}", file=sys.stderr) + return 0 + + total_benchmarks = 0 + + for shape_idx, (m, n, k) in enumerate(shapes): + out_file.write("\n========================================\n") + out_file.write( + f"Shape {shape_idx + 1}: M={m} N={n} K={k} dtype=fp8 layout=rcr\n" + ) + out_file.write("========================================\n") + out_file.write(f"Found {len(executables)} kernels\n") + out_file.flush() + + for exe in executables: + try: + result = subprocess.run( + [ + str(exe), + f"-m={m}", + f"-n={n}", + f"-k={k}", + f"-warmup={warmup}", + f"-repeat={repeat}", + "-verify=0", + ], + capture_output=True, + text=True, + timeout=60, + ) + output = result.stdout + # Extract JSON block from output + json_start = output.find("{") + json_end = output.rfind("}") + 1 + if 
json_start >= 0 and json_end > json_start: + json_block = output[json_start:json_end] + try: + json.loads(json_block) + out_file.write(json_block + "\n") + total_benchmarks += 1 + except json.JSONDecodeError: + pass + except (subprocess.TimeoutExpired, Exception): + pass + + out_file.flush() + elapsed_kernels = len(executables) + print( + f" Shape {shape_idx + 1}/{len(shapes)}: M={m} N={n} K={k} " + f"({elapsed_kernels} kernels)", + file=sys.stderr, + flush=True, + ) + + return total_benchmarks + + +def main(): + parser = argparse.ArgumentParser( + description="Generate wide-coverage benchmark data" + ) + parser.add_argument( + "--bin_dir", + default="/workspace/ck_tile/bin", + help="Directory with benchmark executables", + ) + parser.add_argument("--out_dir", default="data", help="Output directory") + parser.add_argument( + "--batch_size", type=int, default=25, help="Shapes per output file" + ) + parser.add_argument("--warmup", type=int, default=3) + parser.add_argument("--repeat", type=int, default=10) + parser.add_argument( + "--max_shapes", type=int, default=None, help="Limit total shapes (for testing)" + ) + args = parser.parse_args() + + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + shapes = generate_shape_list() + if args.max_shapes: + shapes = shapes[: args.max_shapes] + + print(f"Generated {len(shapes)} unique shapes", file=sys.stderr, flush=True) + print(f"Bin dir: {args.bin_dir}", file=sys.stderr, flush=True) + print(f"Output dir: {args.out_dir}", file=sys.stderr, flush=True) + print(f"Batch size: {args.batch_size}", file=sys.stderr, flush=True) + + total = 0 + batch_idx = 0 + for i in range(0, len(shapes), args.batch_size): + batch = shapes[i : i + args.batch_size] + batch_idx += 1 + out_path = out_dir / f"wide_coverage_batch_{batch_idx:03d}.log" + + print( + f"\nBatch {batch_idx}: shapes {i + 1}-{i + len(batch)} -> {out_path}", + file=sys.stderr, + flush=True, + ) + + with open(out_path, "w") as f: + f.write(f"CK Tile 
Wide Coverage Benchmark Batch {batch_idx}\n") + f.write("GPU ID: 0\n") + f.write("Implementation: gemm_universal\n\n") + count = run_shape_batch( + args.bin_dir, batch, f, warmup=args.warmup, repeat=args.repeat + ) + total += count + + print( + f" Batch {batch_idx} complete: {count} benchmarks", + file=sys.stderr, + flush=True, + ) + + print( + f"\nTotal: {total} benchmarks across {len(shapes)} shapes", + file=sys.stderr, + flush=True, + ) + + +if __name__ == "__main__": + main() diff --git a/dispatcher/heuristics/ml_heuristic_sweep.py b/dispatcher/heuristics/ml_heuristic_sweep.py new file mode 100644 index 0000000000..7190a19678 --- /dev/null +++ b/dispatcher/heuristics/ml_heuristic_sweep.py @@ -0,0 +1,867 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +ML Heuristic Sweep: Comprehensive GEMM Performance Evaluation + +Sweeps across diverse problem shapes with ML-based kernel selection to measure +TFLOPS performance. Supports multiple dtypes (fp16, bf16, fp8) and validates +ML model predictions by executing kernels on GPU. 
+ +Shape Constraints (fp16/bf16 on gfx950): +- M >= 1 (any M is valid) +- N % 8 == 0 AND N >= 64 +- K % 2 == 0 AND K >= 32 + +Usage: + python ml_heuristic_sweep.py --dtype fp16 --num_shapes 256 + python ml_heuristic_sweep.py --dtypes fp16 bf16 --output sweep_results.csv + python ml_heuristic_sweep.py --dtype fp16 --dry_run # Prediction only, no GPU execution +""" + +import sys +import argparse +import time +import csv +from pathlib import Path +from dataclasses import dataclass +from typing import List, Tuple + +# Add parent directories to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent / "python")) + +import numpy as np + +from ctypes_utils import ( + KernelConfig, + setup_gemm_dispatcher, + cleanup_gemm, +) + +try: + from predict import Predictor + # from feature_engine import GemmUniversalFeatureEngine + + HAS_ML = True +except ImportError: + HAS_ML = False + print("WARNING: ML heuristic modules not available. Will use first-fit selection.") + + +@dataclass +class KernelSpec: + """Kernel specification for ML heuristic""" + + name: str + tile_m: int + tile_n: int + tile_k: int + pipeline: str = "compv3" + scheduler: str = "intrawave" + wave_m: int = 2 + wave_n: int = 2 + wave_k: int = 1 + warp_m: int = 32 + warp_n: int = 32 + warp_k: int = 16 + + +# Comprehensive kernel pool covering diverse tile sizes and configurations +KERNEL_POOL = [ + # Small tiles (64x64) + KernelSpec( + "s_64x64_k32_v3", 64, 64, 32, "compv3", "intrawave", 2, 2, 1, 16, 16, 16 + ), + KernelSpec( + "s_64x64_k64_v3", 64, 64, 64, "compv3", "intrawave", 2, 2, 1, 16, 16, 16 + ), + KernelSpec( + "s_64x64_k128_v3", 64, 64, 128, "compv3", "intrawave", 2, 2, 1, 16, 16, 16 + ), + KernelSpec( + "s_64x64_k64_v4", 64, 64, 64, "compv4", "intrawave", 2, 2, 1, 16, 16, 16 + ), + KernelSpec("s_64x64_k64_mem", 64, 64, 64, "mem", "intrawave", 2, 2, 1, 16, 16, 16), + KernelSpec( + "s_64x64_k128_mem", 64, 64, 128, "mem", "intrawave", 2, 2, 1, 16, 16, 16 + ), + # Medium tiles (128x128) + 
KernelSpec("m_128x128_k32_v3", 128, 128, 32, "compv3", "intrawave"), + KernelSpec("m_128x128_k64_v3", 128, 128, 64, "compv3", "intrawave"), + KernelSpec("m_128x128_k128_v3", 128, 128, 128, "compv3", "intrawave"), + KernelSpec("m_128x128_k64_v4", 128, 128, 64, "compv4", "intrawave"), + KernelSpec("m_128x128_k128_v4", 128, 128, 128, "compv4", "intrawave"), + KernelSpec("m_128x128_k64_mem", 128, 128, 64, "mem", "intrawave"), + KernelSpec("m_128x128_k128_mem", 128, 128, 128, "mem", "intrawave"), + # Rectangular medium (M != N) + KernelSpec( + "r_64x128_k32_v3", 64, 128, 32, "compv3", "intrawave", 2, 2, 1, 16, 32, 16 + ), + KernelSpec( + "r_128x64_k32_v3", 128, 64, 32, "compv3", "intrawave", 2, 2, 1, 32, 16, 16 + ), + KernelSpec( + "r_64x128_k64_v3", 64, 128, 64, "compv3", "intrawave", 2, 2, 1, 16, 32, 16 + ), + KernelSpec( + "r_128x64_k64_v3", 128, 64, 64, "compv3", "intrawave", 2, 2, 1, 32, 16, 16 + ), + KernelSpec( + "r_64x256_k32_v3", 64, 256, 32, "compv3", "intrawave", 2, 2, 1, 16, 32, 16 + ), + KernelSpec( + "r_256x64_k32_v3", 256, 64, 32, "compv3", "intrawave", 2, 2, 1, 32, 16, 16 + ), + # Large tiles (256x256) + KernelSpec("l_256x128_k32_v3", 256, 128, 32, "compv3", "intrawave"), + KernelSpec("l_128x256_k32_v3", 128, 256, 32, "compv3", "intrawave"), + KernelSpec("l_256x256_k32_v3", 256, 256, 32, "compv3", "intrawave"), + KernelSpec("l_256x256_k64_v3", 256, 256, 64, "compv3", "intrawave"), + KernelSpec("l_256x256_k64_v4", 256, 256, 64, "compv4", "intrawave"), + # Interwave variants + KernelSpec("m_128x128_k64_iw_v3", 128, 128, 64, "compv3", "interwave"), + KernelSpec("m_128x128_k128_iw_v3", 128, 128, 128, "compv3", "interwave"), + KernelSpec("l_256x256_k32_iw_v3", 256, 256, 32, "compv3", "interwave"), +] + + +def generate_problem_shapes(num_shapes: int = 1024) -> List[Tuple[int, int, int]]: + """ + Generate diverse problem shapes with hardware constraints: + - M >= 1 (any M is valid, including tiny M for inference) + - N % 8 == 0 AND N >= 64 (hardware alignment 
requirement) + - K % 2 == 0 AND K >= 32 (fp16 requirement) + + Covers: + - Powers of 2 (square and rectangular) + - ML workloads (LLM attention, MLP, batch inference) + - Non-power-of-2 dimensions (aligned to constraints) + - Edge cases (tiny M, very large matrices, extreme aspect ratios) + """ + shapes = [] + + # 1. Powers of 2 - Square (64 to 8192) with K variations + for p in range(6, 14): # 2^6=64 to 2^13=8192 + dim = 2**p + shapes.append((dim, dim, dim)) + if dim >= 128: + # K variations (must be even and >= 32) + shapes.append((dim, dim, dim // 2)) + shapes.append((dim, dim, dim * 2)) + shapes.append((dim, dim, max(32, dim // 4))) + + # 2. Small batch inference (1-256 batch, common hidden dims) + # N must be multiple of 8 and >= 64 + hidden_dims = [768, 1024, 2048, 3072, 4096, 5120, 8192, 11008, 12288, 16384] + batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256] + + for hidden in hidden_dims: + for batch in batch_sizes[:8]: + shapes.append((batch, hidden, hidden)) + if hidden >= 4096: + # LLM MLP projections (ensure K is even) + k_mlp = hidden * 3 // 4 + if k_mlp % 2 == 1: + k_mlp += 1 # Make even + if k_mlp >= 32: + shapes.append((batch, hidden, k_mlp)) + shapes.append((batch, k_mlp, hidden)) + + # 3. Attention patterns (seq_len x head_dim) + # seq_len can be any value >= 1, total_dim must be multiple of 8 + seq_lens = [128, 256, 512, 1024, 2048, 4096, 8192] + head_dims = [64, 80, 96, 128, 256] + num_heads = [8, 12, 16, 32, 40, 64] + + for seq in seq_lens: + for head_dim in head_dims: + for nh in num_heads[:4]: + total_dim = nh * head_dim + # total_dim should be multiple of 8 (naturally satisfied for most cases) + if total_dim % 8 == 0 and total_dim >= 64: + # head_dim must be even for K + if head_dim % 2 == 0 and head_dim >= 32: + shapes.append((seq, total_dim, head_dim)) + shapes.append((seq, head_dim, total_dim)) + + # 4. 
Rectangular matrices (extreme aspect ratios) + # All dims must satisfy constraints + dims_m = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096] + dims_n = [64, 128, 256, 512, 1024, 2048, 4096, 8192] # N >= 64, N % 8 == 0 + dims_k = [ + 32, + 64, + 128, + 256, + 512, + 1024, + 2048, + 4096, + 8192, + 16384, + ] # K >= 32, K % 2 == 0 + + # Sample to avoid explosion + for i, m in enumerate(dims_m): + for j, n in enumerate(dims_n): + for _l, k in enumerate(dims_k): + if (i + j + _l) % 3 == 0: # Stratified sampling + shapes.append((m, n, k)) + + # 5. Non-power-of-2 dimensions (aligned to constraints) + # N values: multiples of 8, >= 64 + non_pow2_n = [ + 72, + 80, + 88, + 96, + 104, + 112, + 120, + 136, + 144, + 152, + 160, + 176, + 184, + 192, + 200, + 224, + 240, + 272, + 288, + 304, + 320, + 336, + 352, + 368, + 384, + 400, + 416, + 448, + 480, + 544, + 576, + 640, + 672, + 704, + 736, + 768, + 800, + 832, + 896, + 960, + 1088, + 1152, + 1216, + 1280, + 1344, + 1408, + 1472, + 1536, + 1600, + 1664, + 1728, + 1792, + 1856, + 1920, + 2176, + 2304, + 2432, + 2560, + 2688, + 2816, + 2944, + 3072, + 3200, + 3328, + 3456, + 3584, + 3712, + 3840, + 3968, + 4224, + 4352, + 4480, + 4608, + 4736, + 4864, + 4992, + ] + + # K values: even numbers >= 32 + non_pow2_k = [ + 34, + 36, + 38, + 40, + 42, + 44, + 48, + 50, + 52, + 56, + 60, + 66, + 68, + 72, + 76, + 80, + 88, + 96, + 100, + 112, + 120, + 136, + 144, + 160, + 176, + 192, + 224, + 240, + 272, + 288, + 320, + 352, + 384, + 416, + 448, + 480, + 544, + 576, + 640, + 672, + 704, + 768, + 800, + 832, + 896, + 960, + 1088, + 1152, + 1280, + 1344, + 1408, + 1536, + 1600, + 1664, + 1792, + 1920, + ] + + # M values: any value >= 1 + non_pow2_m = [ + 1, + 3, + 5, + 7, + 9, + 11, + 13, + 15, + 17, + 19, + 23, + 27, + 31, + 33, + 37, + 41, + 47, + 51, + 57, + 63, + 65, + 71, + 79, + 87, + 95, + 97, + 111, + 119, + 127, + 129, + 143, + 159, + 175, + 191, + 193, + 223, + 239, + 255, + 257, + 287, + 319, + 351, + 383, + 385, + 
447, + 479, + 511, + 513, + 575, + 639, + 703, + 767, + 769, + 895, + 959, + 1023, + 1025, + ] + + # Sample non-power-of-2 shapes + for i, m in enumerate(non_pow2_m[:30]): + for j, n in enumerate(non_pow2_n[:20]): + for _l, k in enumerate(non_pow2_k[:15]): + if (i + j + _l) % 4 == 0: # Stratified sampling + shapes.append((m, n, k)) + + # 6. Very tall K (memory-bound) - ensure N % 8 == 0, K % 2 == 0 + for mn in [64, 128, 256, 512, 1024]: + for k in [4096, 8192, 16384]: + shapes.append((mn, mn, k)) + + # 7. Very short K (compute-bound) - ensure K >= 32, K % 2 == 0 + for mn in [512, 1024, 2048, 4096]: + for k in [32, 64, 128]: + shapes.append((mn, mn, k)) + + # 8. Tiny M (edge cases for batch-1 inference) + for m in [1, 2, 4, 8, 16, 32]: + for n in [64, 128, 256, 512, 1024, 2048]: # N >= 64, N % 8 == 0 + for k in [32, 64, 128, 256, 512]: # K >= 32, K % 2 == 0 + shapes.append((m, n, k)) + + # 9. Stress test sizes (aligned to constraints) + stress_sizes = [ + (10000, 10000, 10000), + (1000, 10000, 1000), + (1000, 1000, 10000), + (5000, 5000, 5000), + (7168, 7168, 7168), # Common LLM hidden dim + (8192, 11008, 8192), # LLaMA MLP dimensions + ] + shapes.extend(stress_sizes) + + # Remove duplicates while preserving order + seen = set() + unique_shapes = [] + for s in shapes: + if s not in seen: + seen.add(s) + unique_shapes.append(s) + + # Filter to ensure all shapes meet constraints + valid_shapes = [] + for m, n, k in unique_shapes: + if m >= 1 and n >= 64 and n % 8 == 0 and k >= 32 and k % 2 == 0: + valid_shapes.append((m, n, k)) + + # Sample down to target number if we have too many + if len(valid_shapes) > num_shapes: + # Stratified sampling to preserve diversity + step = len(valid_shapes) / num_shapes + valid_shapes = [valid_shapes[int(i * step)] for i in range(num_shapes)] + + return valid_shapes + + +def spec_to_feature_dict(spec: KernelSpec, dtype: str, layout: str) -> dict: + """Convert KernelSpec to feature dict for ML predictor""" + return { + "kernel_name": 
spec.name, + "tile_m": spec.tile_m, + "tile_n": spec.tile_n, + "tile_k": spec.tile_k, + "warp_m": spec.wave_m, + "warp_n": spec.wave_n, + "warp_k": spec.wave_k, + "warp_tile_m": spec.warp_m, + "warp_tile_n": spec.warp_n, + "warp_tile_k": spec.warp_k, + "pipeline": spec.pipeline, + "scheduler": spec.scheduler, + "epilogue": "cshuffle", + "pad_m": True, # Enable padding to support arbitrary M dimensions + "pad_n": True, # Enable padding to support arbitrary N dimensions + "pad_k": True, # Enable padding to support arbitrary K dimensions + "persistent": False, + "dtype": dtype, + "layout": layout, + } + + +def spec_to_kernel_config( + spec: KernelSpec, dtype: str, arch: str, dtype_acc: str = "fp32" +) -> KernelConfig: + """Convert KernelSpec to KernelConfig for dispatcher""" + return KernelConfig( + dtype_a=dtype, + dtype_b=dtype, + dtype_c=dtype, + dtype_acc=dtype_acc, + layout_a="row", + layout_b="col", + layout_c="row", + tile_m=spec.tile_m, + tile_n=spec.tile_n, + tile_k=spec.tile_k, + wave_m=spec.wave_m, + wave_n=spec.wave_n, + wave_k=spec.wave_k, + warp_m=spec.warp_m, + warp_n=spec.warp_n, + warp_k=spec.warp_k, + pipeline=spec.pipeline, + scheduler=spec.scheduler, + epilogue="cshuffle", + gfx_arch=arch, + ) + + +def ml_select_kernel( + predictor, pool: List[KernelSpec], M: int, N: int, K: int, dtype: str, layout: str +) -> Tuple[KernelSpec, float]: + """Use ML model to select best kernel""" + if not HAS_ML or predictor is None: + # Fallback: select first kernel + return pool[0], 0.0 + + problem = {"m": M, "n": N, "k": K, "dtype": dtype, "layout": layout, "split_k": 1} + kernel_dicts = [spec_to_feature_dict(s, dtype, layout) for s in pool] + + ranked = predictor.rank_kernels(problem, kernel_dicts) + if not ranked: + return pool[0], 0.0 + + best_name, best_tflops = ranked[0] + best_spec = next((s for s in pool if s.name == best_name), pool[0]) + return best_spec, best_tflops + + +def run_single_gemm( + M: int, + N: int, + K: int, + dtype: str, + arch: str, + 
predictor, + dry_run: bool = False, + dtype_acc: str = "fp32", +) -> dict: + """Run a single GEMM with ML heuristic selection""" + + # Select kernel via ML heuristic + t0 = time.time() + best_spec, pred_tflops = ml_select_kernel( + predictor, KERNEL_POOL, M, N, K, dtype, "rcr" + ) + select_time_ms = (time.time() - t0) * 1000 + + result = { + "M": M, + "N": N, + "K": K, + "dtype": dtype, + "selected_kernel": best_spec.name, + "predicted_tflops": pred_tflops, + "selection_time_ms": select_time_ms, + "actual_time_ms": 0, + "actual_tflops": 0, + "status": "SKIP" if dry_run else "PENDING", + "error": None, + } + + if dry_run: + return result + + # Build and run kernel + config = spec_to_kernel_config(best_spec, dtype, arch, dtype_acc) + + try: + setup = setup_gemm_dispatcher( + config=config, + registry_name=f"sweep_{dtype}_{best_spec.name}", + verbose=False, + auto_rebuild=True, + ) + + if not setup.success: + result["status"] = "BUILD_FAIL" + result["error"] = "Failed to build kernel" + cleanup_gemm() + return result + + dispatcher = setup.dispatcher + if not dispatcher.is_supported(M, N, K): + result["status"] = "UNSUPPORTED" + result["error"] = "Problem size not supported by kernel" + cleanup_gemm() + return result + + # Create input data + np_dtype = {"fp16": np.float16, "bf16": np.float16, "fp8": np.float16}[dtype] + np.random.seed(42) + A = (np.random.randn(M, K) * 0.1).astype(np_dtype) + B = (np.random.randn(K, N) * 0.1).astype(np_dtype) + + # Run GEMM + exec_result = dispatcher.run(A, B, M, N, K) + + if exec_result.success: + result["actual_time_ms"] = exec_result.time_ms + result["actual_tflops"] = exec_result.tflops + result["status"] = "SUCCESS" + else: + # Decode status code for better error message + status_messages = { + 0: "Success", + -1: "GPU/HIP error (check permissions, memory, or kernel validity)", + -2: "No suitable kernel found for this problem size", + } + error_msg = status_messages.get(exec_result.status, f"Unknown error 
(status={exec_result.status})") + result["status"] = "RUN_FAIL" + result["error"] = f"{error_msg} (status_code={exec_result.status})" + + # Print detailed error for debugging + print(f" ERROR: {error_msg}") + print(f" Status code: {exec_result.status}") + print(f" Time returned: {exec_result.time_ms}") + print(f" Kernel: {exec_result.kernel_name}") + + cleanup_gemm() + + except Exception as e: + result["status"] = "ERROR" + result["error"] = str(e)[:200] + cleanup_gemm() + + return result + + +def main(): + parser = argparse.ArgumentParser( + description="ML Heuristic Sweep: Test GEMM across many shapes and dtypes" + ) + parser.add_argument( + "--dtypes", + nargs="+", + default=["fp16"], + choices=["fp16", "bf16", "fp8"], + help="Data types to test (default: fp16)", + ) + parser.add_argument( + "--arch", default="gfx950", help="GPU architecture (default: gfx950)" + ) + parser.add_argument( + "--dtype_acc", + default="fp32", + choices=["fp16", "fp32"], + help="Accumulator data type (default: fp32)", + ) + parser.add_argument( + "--model_dir", + default=None, + help="Path to model directory (auto-detect if not specified)", + ) + parser.add_argument( + "--num_shapes", + type=int, + default=256, + help="Number of problem shapes to test (default: 256)", + ) + parser.add_argument( + "--output", + default="ml_heuristic_sweep_results.csv", + help="Output CSV file path", + ) + parser.add_argument( + "--dry_run", + action="store_true", + help="Only predict, do not run kernels (fast validation)", + ) + + args = parser.parse_args() + + # Setup ML predictor + predictor = None + if HAS_ML: + if args.model_dir is None: + # Auto-detect model directory based on first dtype + first_dtype = args.dtypes[0] + heuristics_dir = Path(__file__).parent + model_candidates = [ + heuristics_dir / "models" / f"gemm_universal_{first_dtype}_{args.arch}", + ] + for model_dir in model_candidates: + if model_dir.exists(): + args.model_dir = str(model_dir) + break + + if args.model_dir and 
Path(args.model_dir).exists(): + try: + predictor = Predictor(args.model_dir) + print(f"✓ Loaded ML model from: {args.model_dir}") + except Exception as e: + print(f"⚠ Failed to load ML model: {e}") + print(" Will use first-fit selection instead") + else: + print(f"⚠ Model directory not found: {args.model_dir}") + print(" Will use first-fit selection instead") + + # Generate problem shapes + print(f"\nGenerating {args.num_shapes} problem shapes...") + shapes = generate_problem_shapes(args.num_shapes) + print( + f"✓ Generated {len(shapes)} valid shapes (M>=1, N%8==0, N>=64, K%2==0, K>=32)" + ) + + # Validate all shapes meet constraints + invalid = [ + (m, n, k) + for m, n, k in shapes + if not (m >= 1 and n >= 64 and n % 8 == 0 and k >= 32 and k % 2 == 0) + ] + if invalid: + print(f"⚠ WARNING: {len(invalid)} shapes violate constraints!") + print(f" First few: {invalid[:5]}") + + # Print configuration + print("\n" + "=" * 80) + print(" ML Heuristic Sweep Configuration") + print("=" * 80) + print( + f" Model: {args.model_dir if args.model_dir else 'first-fit (no ML)'}" + ) + print(f" Data types: {', '.join(args.dtypes)}") + print(f" Accumulator: {args.dtype_acc}") + print(f" Architecture: {args.arch}") + print(f" Kernel pool: {len(KERNEL_POOL)} kernels") + print(f" Problem shapes: {len(shapes)}") + print(f" Total tests: {len(shapes) * len(args.dtypes)}") + print( + f" Mode: {'DRY RUN (prediction only)' if args.dry_run else 'FULL RUN (execute kernels)'}" + ) + print(f" Output: {args.output}") + print("=" * 80) + + # Open output CSV + csv_file = open(args.output, "w", newline="") + csv_writer = csv.DictWriter( + csv_file, + fieldnames=[ + "dtype", + "M", + "N", + "K", + "selected_kernel", + "predicted_tflops", + "selection_time_ms", + "actual_time_ms", + "actual_tflops", + "status", + "error", + ], + ) + csv_writer.writeheader() + + # Run sweep + total_tests = len(shapes) * len(args.dtypes) + completed = 0 + start_time = time.time() + + print("\nStarting sweep... 
(Ctrl+C to stop and save partial results)\n") + + try: + for dtype in args.dtypes: + print(f"\n{'=' * 80}") + print(f" Testing dtype: {dtype.upper()}") + print(f"{'=' * 80}\n") + + for i, (M, N, K) in enumerate(shapes): + result = run_single_gemm( + M, N, K, dtype, args.arch, predictor, args.dry_run, args.dtype_acc + ) + + # Write to CSV + csv_writer.writerow(result) + csv_file.flush() + + completed += 1 + + # Progress update + if completed % 10 == 0 or result["status"] != "SUCCESS": + elapsed = time.time() - start_time + rate = completed / elapsed if elapsed > 0 else 0 + eta = (total_tests - completed) / rate if rate > 0 else 0 + + status_emoji = { + "SUCCESS": "✓", + "SKIP": "→", + "BUILD_FAIL": "✗", + "UNSUPPORTED": "○", + "RUN_FAIL": "✗", + "ERROR": "✗", + }.get(result["status"], "?") + + print( + f" [{completed:4d}/{total_tests}] {status_emoji} " + f"{dtype:4s} {M:5d}x{N:5d}x{K:5d} → " + f"{result['selected_kernel']:20s} " + f"pred={result['predicted_tflops']:6.1f} " + f"actual={result['actual_tflops']:6.1f} TFLOPS " + f"[{rate:.1f} tests/s, ETA {eta / 60:.1f}m]" + ) + + except KeyboardInterrupt: + print(f"\n\n⚠ Interrupted! 
Saving partial results to {args.output}...") + + finally: + csv_file.close() + + # Summary + print("\n" + "=" * 80) + print(" SWEEP COMPLETE") + print("=" * 80) + + # Read back results and compute statistics + results = [] + with open(args.output, "r") as f: + reader = csv.DictReader(f) + results = list(reader) + + print(f"\n Total tests: {len(results)}") + print(f" Output file: {args.output}") + + if not args.dry_run: + success = [r for r in results if r["status"] == "SUCCESS"] + print( + f" Successful: {len(success)} ({100 * len(success) / len(results):.1f}%)" + ) + + if success: + avg_tflops = np.mean([float(r["actual_tflops"]) for r in success]) + max_tflops = max([float(r["actual_tflops"]) for r in success]) + print(f" Avg TFLOPS: {avg_tflops:.2f}") + print(f" Max TFLOPS: {max_tflops:.2f}") + + # Per-dtype breakdown + for dtype in args.dtypes: + dtype_results = [r for r in success if r["dtype"] == dtype] + if dtype_results: + avg = np.mean([float(r["actual_tflops"]) for r in dtype_results]) + print( + f" {dtype:4s}: {avg:.2f} TFLOPS (n={len(dtype_results)})" + ) + + print("=" * 80) + print() + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/feature_spec.json b/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/feature_spec.json new file mode 100644 index 0000000000..dc4ed02e5e --- /dev/null +++ b/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/feature_spec.json @@ -0,0 +1,113 @@ +{ + "op_type": "gemm_universal", + "dtype": "fp16", + "arch": "gfx950", + "feature_names": [ + "M", + "N", + "K", + "split_k", + "log2_M", + "log2_N", + "log2_K", + "log2_MNK", + "arithmetic_intensity", + "aspect_ratio_mn", + "aspect_ratio_mk", + "aspect_ratio_nk", + "layout", + "tile_m", + "tile_n", + "tile_k", + "warp_m", + "warp_n", + "warp_k", + "warp_tile_m", + "warp_tile_n", + "warp_tile_k", + "pipeline", + "scheduler", + "epilogue", + "pad_m", + "pad_n", + "pad_k", + "persistent", + 
"num_warps", + "tile_volume", + "tile_mn", + "lds_usage_estimate", + "lds_usage_ratio", + "num_tiles_m", + "num_tiles_n", + "num_tiles_k", + "total_output_tiles", + "tile_eff_m", + "tile_eff_n", + "tile_eff_k", + "overall_tile_efficiency", + "cu_utilization", + "ratio_M_to_tile_m", + "ratio_N_to_tile_n", + "ratio_K_to_tile_k", + "problem_smaller_than_tile_m", + "problem_smaller_than_tile_n", + "problem_smaller_than_tile_k", + "any_dim_too_small", + "needs_padding_m", + "needs_padding_n", + "needs_padding_k", + "has_padding_when_needed_m", + "has_padding_when_needed_n", + "has_padding_when_needed_k", + "missing_required_padding_m", + "missing_required_padding_n", + "missing_required_padding_k", + "missing_any_required_padding", + "hw_num_cus", + "hw_simds_per_cu", + "hw_total_simds", + "hw_shader_engines", + "hw_max_clock_mhz", + "hw_max_waves_per_cu", + "hw_wavefront_size", + "hw_lds_capacity", + "hw_l1_cache_kb", + "hw_l2_cache_kb", + "hw_l3_cache_kb", + "hw_num_xcd" + ], + "categorical_features": [ + "layout", + "pipeline", + "scheduler", + "epilogue" + ], + "targets": [ + "tflops", + "latency", + "bandwidth" + ], + "log_targets": [ + "bandwidth", + "tflops" + ], + "params": { + "objective": "regression", + "metric": [ + "rmse", + "mae" + ], + "num_leaves": 255, + "max_depth": 15, + "n_estimators": 2000, + "learning_rate": 0.02, + "min_child_samples": 10, + "subsample": 0.85, + "colsample_bytree": 0.85, + "reg_alpha": 0.05, + "reg_lambda": 0.5, + "verbose": -1, + "n_jobs": 8, + "seed": 42 + } +} \ No newline at end of file diff --git a/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/model_tflops.lgbm.gz b/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/model_tflops.lgbm.gz new file mode 100644 index 0000000000..a59cc73c4f Binary files /dev/null and b/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/model_tflops.lgbm.gz differ diff --git a/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/train_manifest.json 
b/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/train_manifest.json new file mode 100644 index 0000000000..7028dc32fa --- /dev/null +++ b/dispatcher/heuristics/models/gemm_universal_fp16_gfx950/train_manifest.json @@ -0,0 +1,10 @@ +{ + "warm_start_from": null, + "prev_n_estimators": 0, + "new_n_estimators": 2000, + "total_n_estimators": 2000, + "data_rows": 25600, + "valid_rows": 21920, + "unique_shapes": 25, + "timestamp": "2026-03-20T05:00:55" +} \ No newline at end of file diff --git a/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/feature_spec.json b/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/feature_spec.json new file mode 100644 index 0000000000..ffc4052d9b --- /dev/null +++ b/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/feature_spec.json @@ -0,0 +1,113 @@ +{ + "op_type": "gemm_universal", + "dtype": "fp8", + "arch": "gfx950", + "feature_names": [ + "M", + "N", + "K", + "split_k", + "log2_M", + "log2_N", + "log2_K", + "log2_MNK", + "arithmetic_intensity", + "aspect_ratio_mn", + "aspect_ratio_mk", + "aspect_ratio_nk", + "layout", + "tile_m", + "tile_n", + "tile_k", + "warp_m", + "warp_n", + "warp_k", + "warp_tile_m", + "warp_tile_n", + "warp_tile_k", + "pipeline", + "scheduler", + "epilogue", + "pad_m", + "pad_n", + "pad_k", + "persistent", + "num_warps", + "tile_volume", + "tile_mn", + "lds_usage_estimate", + "lds_usage_ratio", + "num_tiles_m", + "num_tiles_n", + "num_tiles_k", + "total_output_tiles", + "tile_eff_m", + "tile_eff_n", + "tile_eff_k", + "overall_tile_efficiency", + "cu_utilization", + "ratio_M_to_tile_m", + "ratio_N_to_tile_n", + "ratio_K_to_tile_k", + "problem_smaller_than_tile_m", + "problem_smaller_than_tile_n", + "problem_smaller_than_tile_k", + "any_dim_too_small", + "needs_padding_m", + "needs_padding_n", + "needs_padding_k", + "has_padding_when_needed_m", + "has_padding_when_needed_n", + "has_padding_when_needed_k", + "missing_required_padding_m", + "missing_required_padding_n", + 
"missing_required_padding_k", + "missing_any_required_padding", + "hw_num_cus", + "hw_simds_per_cu", + "hw_total_simds", + "hw_shader_engines", + "hw_max_clock_mhz", + "hw_max_waves_per_cu", + "hw_wavefront_size", + "hw_lds_capacity", + "hw_l1_cache_kb", + "hw_l2_cache_kb", + "hw_l3_cache_kb", + "hw_num_xcd" + ], + "categorical_features": [ + "layout", + "pipeline", + "scheduler", + "epilogue" + ], + "targets": [ + "tflops", + "latency", + "bandwidth" + ], + "log_targets": [ + "bandwidth", + "tflops" + ], + "params": { + "objective": "regression", + "metric": [ + "rmse", + "mae" + ], + "num_leaves": 255, + "max_depth": 15, + "n_estimators": 2000, + "learning_rate": 0.02, + "min_child_samples": 10, + "subsample": 0.85, + "colsample_bytree": 0.85, + "reg_alpha": 0.05, + "reg_lambda": 0.5, + "verbose": -1, + "n_jobs": 8, + "seed": 42 + } +} \ No newline at end of file diff --git a/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/model_tflops.lgbm.gz b/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/model_tflops.lgbm.gz new file mode 100644 index 0000000000..a2a08ee01a Binary files /dev/null and b/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/model_tflops.lgbm.gz differ diff --git a/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/train_manifest.json b/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/train_manifest.json new file mode 100644 index 0000000000..d7ce61d2ff --- /dev/null +++ b/dispatcher/heuristics/models/gemm_universal_fp8_gfx950/train_manifest.json @@ -0,0 +1,10 @@ +{ + "warm_start_from": null, + "prev_n_estimators": 0, + "new_n_estimators": 2000, + "total_n_estimators": 2000, + "data_rows": 1296528, + "valid_rows": 1253076, + "unique_shapes": 168, + "timestamp": "2026-03-19T06:10:29" +} \ No newline at end of file diff --git a/dispatcher/heuristics/predict.py b/dispatcher/heuristics/predict.py new file mode 100644 index 0000000000..8738c76f23 --- /dev/null +++ b/dispatcher/heuristics/predict.py @@ -0,0 +1,243 @@ 
+#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Predictor for CK Tile kernel performance. + +Loads trained LightGBM models and provides: + - predict_tflops(): predicted TFLOPS for a single (problem, kernel) pair + - predict_latency(): predicted latency in ms + - predict_bandwidth(): predicted bandwidth in GB/s + - predict_all(): all three predictions at once + - rank_kernels(): rank all candidate kernels by predicted TFLOPS + - select_best(): return the best kernel ID + +Usage: + predictor = Predictor("models/gemm_universal_fp8_gfx950") + best_kernel = predictor.select_best( + problem={"m": 128, "n": 1536, "k": 7168, "dtype": "fp8", "layout": "rcr"}, + kernel_configs=[...], + ) +""" + +import gzip +import json +from pathlib import Path +from typing import Optional + +import lightgbm as lgb +import numpy as np +import pandas as pd + +from feature_engine import GemmUniversalFeatureEngine + + +class Predictor: + """Loads trained models and feature spec for kernel performance prediction. + + Parameters + ---------- + model_dir : str or Path + Directory containing model artifacts: + - model_tflops.lgbm (required) + - model_latency.lgbm (optional) + - model_bandwidth.lgbm (optional) + - feature_spec.json (required) + + feature_engine : FeatureEngine, optional + Override the feature engine. If None, constructs one from feature_spec.json. 
+ """ + + def __init__(self, model_dir: str | Path, feature_engine=None): + self._model_dir = Path(model_dir) + self._models: dict[str, lgb.Booster] = {} + + spec_path = self._model_dir / "feature_spec.json" + if spec_path.exists(): + with open(spec_path) as f: + self._spec = json.load(f) + else: + self._spec = {} + + self._log_targets = set(self._spec.get("log_targets", [])) + + if feature_engine is not None: + self._feature_engine = feature_engine + else: + self._feature_engine = GemmUniversalFeatureEngine() + + def _load_model(self, target: str) -> Optional[lgb.Booster]: + """Lazy-load a model for the given target. + + Automatically decompresses .lgbm.gz files if the .lgbm file doesn't exist. + The decompressed file is cached to disk for subsequent loads. + """ + if target in self._models: + return self._models[target] + + path = self._model_dir / f"model_{target}.lgbm" + gz_path = self._model_dir / f"model_{target}.lgbm.gz" + + # Auto-decompress if needed + if not path.exists() and gz_path.exists(): + with gzip.open(gz_path, 'rb') as f_in: + with open(path, 'wb') as f_out: + f_out.write(f_in.read()) + + if not path.exists(): + return None + + model = lgb.Booster(model_file=str(path)) + self._models[target] = model + return model + + def _predict_single(self, target: str, problem: dict, kernel_config: dict) -> float: + """Predict a single target value, applying inverse log transform if needed.""" + model = self._load_model(target) + if model is None: + raise FileNotFoundError(f"No model_{target}.lgbm in {self._model_dir}") + features = self._feature_engine.extract(problem, kernel_config) + raw = float(model.predict(features.reshape(1, -1))[0]) + if target in self._log_targets: + return float(np.expm1(raw)) + # Clamp to non-negative even for non-log models + return float(max(0.0, raw)) + + def predict_tflops(self, problem: dict, kernel_config: dict) -> float: + """Predict TFLOPS for a single (problem, kernel) pair. 
+ + Returns a real TFLOPS estimate (interpretable, usable as DE surrogate). + If the model was trained in log-space, the inverse transform is applied + automatically. + """ + return self._predict_single("tflops", problem, kernel_config) + + def predict_latency(self, problem: dict, kernel_config: dict) -> float: + """Predict latency in milliseconds for a single (problem, kernel) pair.""" + return self._predict_single("latency", problem, kernel_config) + + def predict_bandwidth(self, problem: dict, kernel_config: dict) -> float: + """Predict bandwidth in GB/s for a single (problem, kernel) pair.""" + return self._predict_single("bandwidth", problem, kernel_config) + + def predict_all(self, problem: dict, kernel_config: dict) -> dict[str, float]: + """Predict all available targets for a single (problem, kernel) pair. + + Returns dict with keys 'tflops', 'latency_ms', 'bandwidth_gb_s' (if models exist). + + Note: Applies inverse log transform for targets in log_targets and clamps + negatives to 0.0, consistent with _predict_single(). + """ + features = self._feature_engine.extract(problem, kernel_config).reshape(1, -1) + result = {} + for target, key in [ + ("tflops", "tflops"), + ("latency", "latency_ms"), + ("bandwidth", "bandwidth_gb_s"), + ]: + model = self._load_model(target) + if model is not None: + raw = float(model.predict(features)[0]) + # Apply inverse log transform if model was trained in log-space + if target in self._log_targets: + result[key] = float(np.expm1(raw)) + else: + # Clamp to non-negative even for non-log models + result[key] = float(max(0.0, raw)) + return result + + def rank_kernels( + self, problem: dict, kernel_configs: list[dict] + ) -> list[tuple[str, float]]: + """Rank candidate kernels by predicted TFLOPS (descending). + + Parameters + ---------- + problem : dict + Problem specification with keys: m, n, k, dtype, layout, split_k. + kernel_configs : list of dict + Each dict must have a 'kernel_name' key plus kernel parameters. 
+ + Returns + ------- + list of (kernel_name, predicted_tflops) tuples, sorted descending. + """ + if not kernel_configs: + return [] + + model = self._load_model("tflops") + if model is None: + raise FileNotFoundError(f"No model_tflops.lgbm in {self._model_dir}") + + rows = [] + for kc in kernel_configs: + merged = {**problem, **kc} + rows.append(merged) + + df = pd.DataFrame(rows) + X = self._feature_engine.extract_batch(df) + preds = model.predict(X) + if "tflops" in self._log_targets: + preds = np.expm1(preds) + + results = [] + for i, kc in enumerate(kernel_configs): + name = kc.get("kernel_name", f"kernel_{i}") + results.append((name, float(preds[i]))) + + results.sort(key=lambda x: -x[1]) + return results + + def select_best(self, problem: dict, kernel_configs: list[dict]) -> str: + """Return the kernel_name of the best predicted kernel.""" + ranked = self.rank_kernels(problem, kernel_configs) + if not ranked: + raise ValueError("No kernel configs provided") + return ranked[0][0] + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Predict kernel performance") + parser.add_argument( + "--model_dir", required=True, help="Directory with trained models" + ) + parser.add_argument("--m", type=int, required=True) + parser.add_argument("--n", type=int, required=True) + parser.add_argument("--k", type=int, required=True) + parser.add_argument("--layout", default="rcr") + parser.add_argument("--dtype", default="fp8") + args = parser.parse_args() + + predictor = Predictor(args.model_dir) + problem = { + "m": args.m, + "n": args.n, + "k": args.k, + "dtype": args.dtype, + "layout": args.layout, + "split_k": 1, + } + + print(f"Loading models from {args.model_dir}...") + print( + f"Problem: M={args.m} N={args.n} K={args.k} dtype={args.dtype} layout={args.layout}" + ) + + data_dir = Path(args.model_dir).parent.parent / "data" + if data_dir.exists(): + for pq in data_dir.glob("*.parquet"): + df = pd.read_parquet(pq) + 
kernel_names = df["kernel_name"].unique() + configs = [] + for kn in kernel_names[:10]: + row = df[df["kernel_name"] == kn].iloc[0] + configs.append(row.to_dict()) + if configs: + ranked = predictor.rank_kernels(problem, configs) + print(f"\nTop 5 kernels (from {len(configs)} candidates):") + for name, tflops in ranked[:5]: + print(f" {tflops:8.2f} TFLOPS {name}") + break diff --git a/dispatcher/heuristics/search.py b/dispatcher/heuristics/search.py new file mode 100644 index 0000000000..f9b7e13b09 --- /dev/null +++ b/dispatcher/heuristics/search.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Surrogate search for CK Tile kernel configuration optimization. + +Uses a trained LGBMRegressor as a cheap surrogate function to search the +discrete kernel parameter space (tile sizes, warp config, pipeline, etc.) +without running actual GPU benchmarks. + +Strategies: + - 'random': Sample N random valid configs, score all, return top-K. + - 'de': Discrete Differential Evolution with mutation over valid parameter choices. + +Usage: + from search import SurrogateSearch + from predict import Predictor + + predictor = Predictor("models/gemm_universal_fp8_gfx950") + searcher = SurrogateSearch(predictor, strategy='random') + results = searcher.search( + problem={"m": 128, "n": 1536, "k": 7168, "dtype": "fp8", "layout": "rcr"}, + budget=500, + ) + # results: [(config_dict, predicted_tflops), ...] sorted descending +""" + +import random +from typing import Optional + +import numpy as np + +from feature_engine import GemmUniversalFeatureEngine +from predict import Predictor + + +class SurrogateSearch: + """Search kernel parameter space using ML regressor as surrogate objective. + + Parameters + ---------- + predictor : Predictor + Trained predictor with a TFLOPS model. + feature_engine : GemmUniversalFeatureEngine, optional + Feature engine for parameter space and validation. 
If None, uses default. + strategy : str + Search strategy: 'random' or 'de' (Discrete Differential Evolution). + seed : int + Random seed for reproducibility. + """ + + def __init__( + self, + predictor: Predictor, + feature_engine: Optional[GemmUniversalFeatureEngine] = None, + strategy: str = "random", + seed: int = 42, + ): + self._predictor = predictor + self._fe = feature_engine or GemmUniversalFeatureEngine() + self._strategy = strategy + self._rng = random.Random(seed) + self._np_rng = np.random.RandomState(seed) + self._param_space = self._fe.get_parameter_space() + + def _sample_random_config(self) -> dict: + """Sample a single random config from the parameter space.""" + config = {} + for param, values in self._param_space.items(): + config[param] = self._rng.choice(values) + return config + + def _sample_valid_config(self, max_attempts: int = 50) -> Optional[dict]: + """Sample a random config that passes all validation constraints.""" + for _ in range(max_attempts): + config = self._sample_random_config() + if self._fe.validate_config(config): + return config + return None + + def _score_config(self, problem: dict, config: dict) -> float: + """Score a config using the predictor.""" + return self._predictor.predict_tflops(problem, config) + + def _search_random( + self, problem: dict, budget: int, top_k: int + ) -> list[tuple[dict, float]]: + """Random search: sample valid configs, score all, return top-K.""" + configs = [] + for _ in range(budget): + cfg = self._sample_valid_config() + if cfg is not None: + configs.append(cfg) + + if not configs: + return [] + + scored = [] + for cfg in configs: + try: + score = self._score_config(problem, cfg) + scored.append((cfg, score)) + except Exception: + continue + + scored.sort(key=lambda x: -x[1]) + return scored[:top_k] + + def _search_de( + self, + problem: dict, + budget: int, + top_k: int, + pop_size: int = 20, + mutation_rate: float = 0.3, + crossover_rate: float = 0.7, + ) -> list[tuple[dict, float]]: + 
"""Discrete Differential Evolution. + + Uses discrete mutation: randomly swap parameters to other valid values + from the parameter space (no continuous relaxation + snap). + + Each generation: + 1. For each member of the population, create a trial vector by: + - Selecting 3 random other members (a, b, c) + - For each parameter, with probability mutation_rate, take the value + from a, b, or c (uniform choice among the three donors) + - With probability crossover_rate, take the trial value; otherwise keep original + 2. Validate the trial; if invalid, resample that parameter from the space + 3. Score the trial; if better than parent, replace + """ + param_names = list(self._param_space.keys()) + + population = [] + for _ in range(pop_size): + cfg = self._sample_valid_config() + if cfg is not None: + score = self._score_config(problem, cfg) + population.append((cfg, score)) + + if len(population) < 4: + return self._search_random(problem, budget, top_k) + + evals_used = len(population) + max_gens = (budget - evals_used) // pop_size + + for gen in range(max_gens): + new_pop = [] + for i, (parent, parent_score) in enumerate(population): + candidates = [j for j in range(len(population)) if j != i] + if len(candidates) < 3: + new_pop.append((parent, parent_score)) + continue + + a_idx, b_idx, c_idx = self._rng.sample(candidates, 3) + a, b, c = ( + population[a_idx][0], + population[b_idx][0], + population[c_idx][0], + ) + + trial = dict(parent) + for param in param_names: + if self._rng.random() < mutation_rate: + donor = self._rng.choice([a, b, c]) + trial[param] = donor.get(param, parent.get(param)) + + if self._rng.random() > crossover_rate: + trial[param] = parent.get(param) + + if not self._fe.validate_config(trial): + for param in param_names: + if param in trial and trial[param] not in self._param_space.get( + param, [trial[param]] + ): + trial[param] = self._rng.choice(self._param_space[param]) + if not self._fe.validate_config(trial): + new_pop.append((parent, 
parent_score)) + continue + + try: + trial_score = self._score_config(problem, trial) + evals_used += 1 + except Exception: + new_pop.append((parent, parent_score)) + continue + + if trial_score > parent_score: + new_pop.append((trial, trial_score)) + else: + new_pop.append((parent, parent_score)) + + population = new_pop + + population.sort(key=lambda x: -x[1]) + return population[:top_k] + + def search( + self, + problem: dict, + budget: int = 500, + top_k: int = 10, + **kwargs, + ) -> list[tuple[dict, float]]: + """Search the kernel parameter space for the best configuration. + + Parameters + ---------- + problem : dict + Problem specification: m, n, k, dtype, layout, split_k. + budget : int + Maximum number of surrogate evaluations. + top_k : int + Number of top configurations to return. + **kwargs + Strategy-specific parameters (pop_size, mutation_rate, etc.). + + Returns + ------- + list of (config_dict, predicted_tflops), sorted descending by TFLOPS. + """ + if self._strategy == "random": + return self._search_random(problem, budget, top_k) + elif self._strategy == "de": + return self._search_de(problem, budget, top_k, **kwargs) + else: + raise ValueError(f"Unknown strategy: {self._strategy}") + + +if __name__ == "__main__": + import argparse + import time + + parser = argparse.ArgumentParser( + description="Surrogate search for optimal kernel config" + ) + parser.add_argument("--model_dir", required=True) + parser.add_argument("--m", type=int, required=True) + parser.add_argument("--n", type=int, required=True) + parser.add_argument("--k", type=int, required=True) + parser.add_argument("--dtype", default="fp8") + parser.add_argument("--layout", default="rcr") + parser.add_argument("--strategy", default="random", choices=["random", "de"]) + parser.add_argument("--budget", type=int, default=500) + parser.add_argument("--top_k", type=int, default=10) + args = parser.parse_args() + + predictor = Predictor(args.model_dir) + searcher = SurrogateSearch(predictor, 
strategy=args.strategy) + problem = { + "m": args.m, + "n": args.n, + "k": args.k, + "dtype": args.dtype, + "layout": args.layout, + "split_k": 1, + } + + print(f"Searching with strategy={args.strategy}, budget={args.budget}...") + t0 = time.time() + results = searcher.search(problem, budget=args.budget, top_k=args.top_k) + elapsed = time.time() - t0 + + print(f"\nTop {len(results)} configs found in {elapsed * 1000:.1f}ms:") + for i, (cfg, tflops) in enumerate(results): + tile_str = f"{cfg.get('tile_m', '?')}x{cfg.get('tile_n', '?')}x{cfg.get('tile_k', '?')}" + warp_str = f"{cfg.get('warp_m', '?')}x{cfg.get('warp_n', '?')}x{cfg.get('warp_k', '?')}" + print( + f" #{i + 1}: {tflops:8.2f} TFLOPS tile={tile_str} warp={warp_str} " + f"pipeline={cfg.get('pipeline', '?')} scheduler={cfg.get('scheduler', '?')}" + ) diff --git a/dispatcher/heuristics/tests/__init__.py b/dispatcher/heuristics/tests/__init__.py new file mode 100644 index 0000000000..1df4857184 --- /dev/null +++ b/dispatcher/heuristics/tests/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT diff --git a/dispatcher/heuristics/tests/test_data_pipeline.py b/dispatcher/heuristics/tests/test_data_pipeline.py new file mode 100644 index 0000000000..d643138693 --- /dev/null +++ b/dispatcher/heuristics/tests/test_data_pipeline.py @@ -0,0 +1,368 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for data_pipeline.py. + +Covers: kernel name parsing, layout derivation, streaming log parsing, +schema validation, and corner cases (empty logs, malformed JSON, single-shape). 
+""" + +import tempfile +from pathlib import Path + +import pandas as pd +import pytest + +import sys + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from data_pipeline import ( + parse_kernel_name, + _layout_from_problem, + parse_streaming_log, + save_parquet, + load_parquet, + CANONICAL_COLUMNS, +) + + +# --------------------------------------------------------------------------- +# parse_kernel_name +# --------------------------------------------------------------------------- + + +class TestParseKernelName: + def test_standard_name(self): + name = "gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128" + result = parse_kernel_name(name) + assert result["dtype"] == "fp8" + assert result["layout"] == "rcr" + assert result["pipeline"] == "compv3" + assert result["epilogue"] == "cshuffle" + assert result["scheduler"] == "intrawave" + assert result["pad_m"] is False + assert result["pad_n"] is False + assert result["pad_k"] is False + assert result["persistent"] is False + assert result["tile_m"] == 128 + assert result["tile_n"] == 128 + assert result["tile_k"] == 128 + assert result["warp_m"] == 1 + assert result["warp_n"] == 4 + assert result["warp_k"] == 1 + assert result["warp_tile_m"] == 16 + assert result["warp_tile_n"] == 16 + assert result["warp_tile_k"] == 128 + + def test_with_padding_and_persistent(self): + name = "gemm_universal_fp16_rrr_compv4_default_interwave_True_True_True_True_256x256x64_2x2x1_32x32x16" + result = parse_kernel_name(name) + assert result["dtype"] == "fp16" + assert result["layout"] == "rrr" + assert result["pad_m"] is True + assert result["pad_n"] is True + assert result["pad_k"] is True + assert result["persistent"] is True + assert result["tile_m"] == 256 + + def test_empty_name(self): + assert parse_kernel_name("") == {} + + def test_malformed_name(self): + assert parse_kernel_name("not_a_kernel_name") == {} + + def test_partial_name(self): + result = 
parse_kernel_name("gemm_universal_fp8_rcr_compv3") + assert result.get("dtype") == "fp8" + assert result.get("layout") == "rcr" + assert "tile_m" not in result # not enough parts + + def test_all_layouts(self): + for layout in ["rcr", "rrr", "crr", "ccr"]: + name = f"gemm_universal_fp8_{layout}_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128" + result = parse_kernel_name(name) + assert result["layout"] == layout + + +# --------------------------------------------------------------------------- +# _layout_from_problem +# --------------------------------------------------------------------------- + + +class TestLayoutFromProblem: + def test_rcr(self): + assert ( + _layout_from_problem( + { + "layout_a": "RowMajor", + "layout_b": "ColumnMajor", + "layout_c": "RowMajor", + } + ) + == "rcr" + ) + + def test_rrr(self): + assert ( + _layout_from_problem( + {"layout_a": "RowMajor", "layout_b": "RowMajor", "layout_c": "RowMajor"} + ) + == "rrr" + ) + + def test_empty(self): + assert _layout_from_problem({}) == "???" 
+ + def test_case_insensitive(self): + assert ( + _layout_from_problem( + { + "layout_a": "rowmajor", + "layout_b": "COLUMNMAJOR", + "layout_c": "RowMajor", + } + ) + == "rcr" + ) + + +# --------------------------------------------------------------------------- +# parse_streaming_log +# --------------------------------------------------------------------------- + +SAMPLE_LOG = """\ +================================================================================ +LOG FILE: test.log +================================================================================ +CK Tile Profiling Run +GPU ID: 0 + +--- Running CK Tile benchmarks on GPU 0 --- + +======================================== +Shape 1: M=16 N=1536 K=7168 dtype=fp8 layout=rcr +======================================== +Found 2 kernels +{ + "name": "gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128", + "problem": { + "split_k":1, + "m":16, + "n":1536, + "k":7168, + "stride_a":7168, + "stride_b":7168, + "stride_c":1536, + "dtype_a":"fp8", + "dtype_b":"fp8", + "dtype_acc":"fp32", + "dtype_c":"fp16", + "layout_a":"RowMajor", + "layout_b":"ColumnMajor", + "layout_c":"RowMajor", + "structured_sparsity":false +}, + "perf_result": { + "latency(ms)": 0.04, + "tflops(TFlops)": 8.81, + "bandwidth(GB/s)": 279.51 +} +} +{ + "name": "gemm_universal_fp8_rcr_compv4_default_intrawave_False_False_False_False_128x128x64_2x2x1_32x32x16", + "problem": { + "split_k":1, + "m":16, + "n":1536, + "k":7168, + "stride_a":7168, + "stride_b":7168, + "stride_c":1536, + "dtype_a":"fp8", + "dtype_b":"fp8", + "dtype_acc":"fp32", + "dtype_c":"fp16", + "layout_a":"RowMajor", + "layout_b":"ColumnMajor", + "layout_c":"RowMajor", + "structured_sparsity":false +}, + "perf_result": { + "latency(ms)": 0.05, + "tflops(TFlops)": 7.22, + "bandwidth(GB/s)": 228.85 +} +} + +======================================== +Shape 2: M=20480 N=7168 K=256 dtype=fp8 layout=rcr 
+======================================== +Found 1 kernels +{ + "name": "gemm_universal_fp8_rcr_mem_cshuffle_intrawave_False_False_False_True_64x64x128_1x4x1_16x16x32", + "problem": { + "split_k":1, + "m":20480, + "n":7168, + "k":256, + "stride_a":256, + "stride_b":256, + "stride_c":7168, + "dtype_a":"fp8", + "dtype_b":"fp8", + "dtype_acc":"fp32", + "dtype_c":"fp16", + "layout_a":"RowMajor", + "layout_b":"ColumnMajor", + "layout_c":"RowMajor", + "structured_sparsity":false +}, + "perf_result": { + "latency(ms)": 0.15, + "tflops(TFlops)": 505.00, + "bandwidth(GB/s)": 1200.50 +} +} +""" + + +class TestParseStreamingLog: + def _write_log(self, content: str) -> Path: + f = tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False) + f.write(content) + f.close() + return Path(f.name) + + def test_basic_parse(self): + path = self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path, arch="gfx950") + assert len(df) == 3 + assert df["arch"].iloc[0] == "gfx950" + assert df["m"].tolist() == [16, 16, 20480] + assert df["n"].tolist() == [1536, 1536, 7168] + assert df["k"].tolist() == [7168, 7168, 256] + + def test_tflops_values(self): + path = self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path) + assert df["measured_tflops"].tolist() == pytest.approx([8.81, 7.22, 505.0]) + + def test_kernel_config_parsed(self): + path = self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path) + assert df["tile_m"].iloc[0] == 128 + assert df["pipeline"].iloc[0] == "compv3" + assert df["pipeline"].iloc[1] == "compv4" + + def test_layout_derived_from_json(self): + path = self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path) + assert all(df["layout"] == "rcr") + + def test_empty_log(self): + path = self._write_log("No shapes here\nJust noise\n") + df = parse_streaming_log(path) + assert len(df) == 0 + for col in CANONICAL_COLUMNS: + assert col in df.columns + + def test_single_kernel(self): + log = """\ +Shape 1: M=1 N=1 K=1 dtype=fp8 layout=rcr +{ + "name": 
"gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128", + "problem": {"split_k":1, "m":1, "n":1, "k":1, "dtype_a":"fp8", "dtype_b":"fp8", "layout_a":"RowMajor", "layout_b":"ColumnMajor", "layout_c":"RowMajor"}, + "perf_result": {"latency(ms)": 0.001, "tflops(TFlops)": 0.002, "bandwidth(GB/s)": 0.01} +} +""" + path = self._write_log(log) + df = parse_streaming_log(path) + assert len(df) == 1 + assert df["m"].iloc[0] == 1 + assert bool(df["is_valid"].iloc[0]) is True + + def test_zero_tflops_marked_invalid(self): + log = """\ +Shape 1: M=16 N=16 K=16 dtype=fp8 layout=rcr +{ + "name": "test_kernel", + "problem": {"split_k":1, "m":16, "n":16, "k":16, "dtype_a":"fp8"}, + "perf_result": {"latency(ms)": 0.0, "tflops(TFlops)": 0.0, "bandwidth(GB/s)": 0.0} +} +""" + path = self._write_log(log) + df = parse_streaming_log(path) + assert len(df) == 1 + assert bool(df["is_valid"].iloc[0]) is False + + def test_malformed_json_skipped(self): + log = """\ +Shape 1: M=16 N=16 K=16 dtype=fp8 layout=rcr +{ + "name": "good_kernel", + "problem": {"split_k":1, "m":16, "n":16, "k":16, "dtype_a":"fp8"}, + "perf_result": {"latency(ms)": 0.01, "tflops(TFlops)": 1.0, "bandwidth(GB/s)": 10.0} +} +{ this is not valid json } +{ + "name": "another_good", + "problem": {"split_k":1, "m":16, "n":16, "k":16, "dtype_a":"fp8"}, + "perf_result": {"latency(ms)": 0.02, "tflops(TFlops)": 2.0, "bandwidth(GB/s)": 20.0} +} +""" + path = self._write_log(log) + df = parse_streaming_log(path) + assert len(df) == 2 + + def test_extreme_shapes(self): + """Tiny M=1 (single token) and very large M=20480.""" + path = self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path) + assert 1 not in df["m"].values # sample has M=16, M=20480 + assert 16 in df["m"].values + assert 20480 in df["m"].values + + def test_run_id_assigned(self): + path = self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path, run_id="test_run_123") + assert all(df["run_id"] == "test_run_123") 
+ + def test_op_type_assigned(self): + path = self._write_log(SAMPLE_LOG) + df = parse_streaming_log(path, op_type="gemm_streamk") + assert all(df["op_type"] == "gemm_streamk") + + +# --------------------------------------------------------------------------- +# Parquet round-trip +# --------------------------------------------------------------------------- + + +class TestParquetIO: + def test_round_trip(self, tmp_path): + df = pd.DataFrame( + { + "m": [16, 32], + "n": [1536, 1536], + "k": [7168, 7168], + "measured_tflops": [8.81, 15.0], + } + ) + path = tmp_path / "test.parquet" + save_parquet(df, path) + loaded = load_parquet(path) + assert len(loaded) == 2 + assert loaded["m"].tolist() == [16, 32] + + def test_creates_parent_dirs(self, tmp_path): + path = tmp_path / "sub" / "dir" / "test.parquet" + df = pd.DataFrame({"x": [1]}) + save_parquet(df, path) + assert path.exists() + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/tests/test_dispatcher_integration.py b/dispatcher/heuristics/tests/test_dispatcher_integration.py new file mode 100644 index 0000000000..a80438629d --- /dev/null +++ b/dispatcher/heuristics/tests/test_dispatcher_integration.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for dispatcher_integration.py. + +Covers: kernel name parsing to feature dict, feature dict to dispatcher config +(name mapping inversion), MLKernelSpec creation, binary pool loading, and +the ML heuristic function. 
+""" + +import json +import sys +from pathlib import Path + +import lightgbm as lgb +import numpy as np +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from dispatcher_integration import ( + kernel_config_to_feature_dict, + feature_dict_to_dispatcher_config, + feature_dict_to_ml_spec, + ml_spec_to_dispatcher_config, + create_ml_heuristic, + load_kernel_pool_from_binaries, + MLKernelSpec, + LAYOUT_TO_DISPATCHER, +) +from feature_engine import GemmUniversalFeatureEngine + + +SAMPLE_KERNEL_NAME = ( + "gemm_universal_fp8_rcr_compv3_cshuffle_intrawave" + "_False_False_False_False_128x128x128_1x4x1_16x16x128" +) + + +# --------------------------------------------------------------------------- +# kernel_config_to_feature_dict +# --------------------------------------------------------------------------- + + +class TestKernelConfigToFeatureDict: + def test_parses_standard_name(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + assert feat["tile_m"] == 128 + assert feat["tile_n"] == 128 + assert feat["tile_k"] == 128 + assert feat["warp_m"] == 1 # warps per block + assert feat["warp_n"] == 4 + assert feat["warp_k"] == 1 + assert feat["warp_tile_m"] == 16 + assert feat["warp_tile_n"] == 16 + assert feat["warp_tile_k"] == 128 + assert feat["pipeline"] == "compv3" + assert feat["scheduler"] == "intrawave" + assert feat["epilogue"] == "cshuffle" + assert feat["kernel_name"] == SAMPLE_KERNEL_NAME + + def test_empty_name_returns_empty(self): + assert kernel_config_to_feature_dict("") == {} + + def test_invalid_name_returns_empty(self): + assert kernel_config_to_feature_dict("not_a_kernel") == {} + + +# --------------------------------------------------------------------------- +# Name mapping: feature dict <-> dispatcher config +# --------------------------------------------------------------------------- + + +class TestNameMapping: + """The critical inversion: feature engine warp_m/n/k (warps per block) + maps to dispatcher 
wave_m/n/k, and feature engine warp_tile_m/n/k + maps to dispatcher warp_m/n/k.""" + + def test_warp_to_wave_mapping(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + disp = feature_dict_to_dispatcher_config(feat) + assert disp["wave_m"] == feat["warp_m"] # 1 + assert disp["wave_n"] == feat["warp_n"] # 4 + assert disp["wave_k"] == feat["warp_k"] # 1 + + def test_warp_tile_to_warp_mapping(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + disp = feature_dict_to_dispatcher_config(feat) + assert disp["warp_m"] == feat["warp_tile_m"] # 16 + assert disp["warp_n"] == feat["warp_tile_n"] # 16 + assert disp["warp_k"] == feat["warp_tile_k"] # 128 + + def test_tile_dims_pass_through(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + disp = feature_dict_to_dispatcher_config(feat) + assert disp["tile_m"] == 128 + assert disp["tile_n"] == 128 + assert disp["tile_k"] == 128 + + def test_pipeline_passes_through(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + disp = feature_dict_to_dispatcher_config(feat) + assert disp["pipeline"] == "compv3" + assert disp["scheduler"] == "intrawave" + assert disp["epilogue"] == "cshuffle" + + def test_rcr_layout_mapping(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + disp = feature_dict_to_dispatcher_config(feat, dtype="fp8") + assert disp["layout_a"] == "row" + assert disp["layout_b"] == "col" + assert disp["layout_c"] == "row" + + def test_all_layouts(self): + for layout, (la, lb, lc) in LAYOUT_TO_DISPATCHER.items(): + feat = {"layout": layout, "tile_m": 128} + disp = feature_dict_to_dispatcher_config(feat) + assert disp["layout_a"] == la + assert disp["layout_b"] == lb + assert disp["layout_c"] == lc + + +# --------------------------------------------------------------------------- +# MLKernelSpec +# --------------------------------------------------------------------------- + + +class TestMLKernelSpec: + def test_from_feature_dict(self): + feat = 
kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + spec = feature_dict_to_ml_spec(feat, predicted_tflops=123.4) + assert spec.kernel_name == SAMPLE_KERNEL_NAME + assert spec.predicted_tflops == 123.4 + assert spec.tile_m == 128 + assert spec.wave_m == 1 # was warp_m in feature space + assert spec.warp_m == 16 # was warp_tile_m in feature space + + def test_spec_to_dispatcher_config(self): + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + spec = feature_dict_to_ml_spec(feat, 100.0) + disp = ml_spec_to_dispatcher_config(spec, dtype="fp8", arch="gfx950") + assert disp["tile_m"] == 128 + assert disp["wave_m"] == 1 + assert disp["warp_m"] == 16 + assert disp["gfx_arch"] == "gfx950" + assert disp["dtype_a"] == "fp8" + + def test_roundtrip_preserves_values(self): + """feature_dict -> MLKernelSpec -> dispatcher_config should be consistent.""" + feat = kernel_config_to_feature_dict(SAMPLE_KERNEL_NAME) + spec = feature_dict_to_ml_spec(feat, 0.0) + disp_from_spec = ml_spec_to_dispatcher_config(spec) + disp_from_feat = feature_dict_to_dispatcher_config(feat) + for key in [ + "tile_m", + "tile_n", + "tile_k", + "wave_m", + "wave_n", + "wave_k", + "warp_m", + "warp_n", + "warp_k", + "pipeline", + "scheduler", + "epilogue", + ]: + assert disp_from_spec[key] == disp_from_feat[key], f"Mismatch on {key}" + + +# --------------------------------------------------------------------------- +# Binary pool loading +# --------------------------------------------------------------------------- + + +class TestLoadKernelPool: + def test_loads_from_real_bin_dir(self): + bin_dir = Path("/workspace/ck_tile/bin") + if not bin_dir.exists(): + pytest.skip("No /workspace/ck_tile/bin") + pool = load_kernel_pool_from_binaries(bin_dir) + assert len(pool) > 0 + assert "tile_m" in pool[0] + assert "kernel_name" in pool[0] + + def test_empty_dir_returns_empty(self, tmp_path): + pool = load_kernel_pool_from_binaries(tmp_path) + assert pool == [] + + +# 
--------------------------------------------------------------------------- +# ML heuristic function +# --------------------------------------------------------------------------- + + +class TestCreateMLHeuristic: + @pytest.fixture + def mock_model_dir(self, tmp_path): + """Create a minimal model for testing the heuristic flow.""" + fe = GemmUniversalFeatureEngine() + n_features = len(fe.get_feature_names()) + np.random.seed(42) + X = np.random.rand(100, n_features) + y = np.random.rand(100) * 500 + model = lgb.LGBMRegressor(n_estimators=5, verbose=-1) + model.fit(X, y) + model.booster_.save_model(str(tmp_path / "model_tflops.lgbm")) + spec = { + "feature_names": fe.get_feature_names(), + "categorical_features": fe.get_categorical_features(), + } + with open(tmp_path / "feature_spec.json", "w") as f: + json.dump(spec, f) + return tmp_path + + def _make_pool(self): + """Create a small synthetic kernel pool.""" + names = [ + "gemm_universal_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_128x128x128_1x4x1_16x16x128", + "gemm_universal_fp8_rcr_compv4_default_intrawave_False_False_False_False_128x128x64_2x2x1_32x32x16", + "gemm_universal_fp8_rcr_mem_cshuffle_interwave_False_False_False_False_64x64x128_1x4x1_16x16x32", + ] + return [kernel_config_to_feature_dict(n) for n in names] + + def test_returns_ml_kernel_spec(self, mock_model_dir): + pool = self._make_pool() + heuristic = create_ml_heuristic(mock_model_dir, kernel_pool=pool) + result = heuristic(1024, 1024, 1024) + assert isinstance(result, MLKernelSpec) + assert result.tile_m > 0 + assert isinstance(result.predicted_tflops, float) + + def test_returns_valid_kernel_from_pool(self, mock_model_dir): + pool = self._make_pool() + pool_names = {p["kernel_name"] for p in pool} + heuristic = create_ml_heuristic(mock_model_dir, kernel_pool=pool) + result = heuristic(1024, 1024, 1024) + assert result.kernel_name in pool_names + + def test_different_shapes_may_select_different_kernels(self, mock_model_dir): + 
pool = self._make_pool() + heuristic = create_ml_heuristic(mock_model_dir, kernel_pool=pool) + r1 = heuristic(16, 1536, 7168) + r2 = heuristic(8192, 8192, 256) + # At minimum both should return valid specs + assert r1.tile_m > 0 + assert r2.tile_m > 0 + + def test_m1_corner_case(self, mock_model_dir): + pool = self._make_pool() + heuristic = create_ml_heuristic(mock_model_dir, kernel_pool=pool) + result = heuristic(1, 4096, 4096) + assert isinstance(result, MLKernelSpec) + assert np.isfinite(result.predicted_tflops) + + def test_empty_pool_raises(self, mock_model_dir): + with pytest.raises(ValueError, match="No kernel configs"): + create_ml_heuristic(mock_model_dir, kernel_pool=[]) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/tests/test_evaluate.py b/dispatcher/heuristics/tests/test_evaluate.py new file mode 100644 index 0000000000..bcbe39af9d --- /dev/null +++ b/dispatcher/heuristics/tests/test_evaluate.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for evaluate.py. + +Covers: shape family classification, K-depth regime classification, +and basic evaluation metric checks. 
+""" + +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from evaluate import classify_shape_family, classify_k_regime + + +class TestClassifyShapeFamily: + def test_tiny_m(self): + assert classify_shape_family(1, 4096, 4096) == "tiny_m" + assert classify_shape_family(16, 1536, 7168) == "tiny_m" + + def test_small_m(self): + assert classify_shape_family(32, 1536, 7168) == "small_m" + assert classify_shape_family(128, 4096, 4096) == "small_m" + + def test_medium_m(self): + assert classify_shape_family(256, 1024, 1024) == "medium_m" + assert classify_shape_family(2048, 2048, 2048) == "medium_m" + + def test_large_m(self): + assert classify_shape_family(4096, 4096, 4096) == "large_m" + assert classify_shape_family(20480, 7168, 256) == "large_m" + + +class TestClassifyKRegime: + def test_shallow(self): + assert classify_k_regime(256) == "shallow_k" + assert classify_k_regime(32) == "shallow_k" + + def test_medium(self): + assert classify_k_regime(1024) == "medium_k" + assert classify_k_regime(2048) == "medium_k" + + def test_deep(self): + assert classify_k_regime(4096) == "deep_k" + assert classify_k_regime(7168) == "deep_k" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/tests/test_feature_engine.py b/dispatcher/heuristics/tests/test_feature_engine.py new file mode 100644 index 0000000000..492623ce99 --- /dev/null +++ b/dispatcher/heuristics/tests/test_feature_engine.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for feature_engine.py. + +Covers: feature count consistency, formula correctness (tile efficiency, LDS, +arithmetic intensity), corner-case shapes (M=1, huge M, square, skinny-K), +parameter space validity, config validation, and batch vs single extraction parity. 
+""" + +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from feature_engine import ( + GemmUniversalFeatureEngine, +) + + +@pytest.fixture +def fe(): + """Default feature engine with MI355X-like hardware.""" + return GemmUniversalFeatureEngine( + num_cus=256, + lds_capacity=65536, + max_clock_mhz=2400, + simds_per_cu=4, + shader_engines=32, + max_waves_per_cu=32, + wavefront_size=64, + l1_cache_kb=32, + l2_cache_kb=4096, + l3_cache_kb=262144, + num_xcd=8, + ) + + +def _make_problem(m=1024, n=1024, k=1024, dtype="fp8", layout="rcr", split_k=1): + return { + "m": m, + "n": n, + "k": k, + "dtype": dtype, + "layout": layout, + "split_k": split_k, + } + + +def _make_kernel( + tile_m=128, + tile_n=128, + tile_k=64, + warp_m=2, + warp_n=2, + warp_k=1, + warp_tile_m=32, + warp_tile_n=32, + warp_tile_k=16, + pipeline="compv3", + scheduler="intrawave", + epilogue="cshuffle", + pad_m=False, + pad_n=False, + pad_k=False, + persistent=False, +): + return { + "tile_m": tile_m, + "tile_n": tile_n, + "tile_k": tile_k, + "warp_m": warp_m, + "warp_n": warp_n, + "warp_k": warp_k, + "warp_tile_m": warp_tile_m, + "warp_tile_n": warp_tile_n, + "warp_tile_k": warp_tile_k, + "pipeline": pipeline, + "scheduler": scheduler, + "epilogue": epilogue, + "pad_m": pad_m, + "pad_n": pad_n, + "pad_k": pad_k, + "persistent": persistent, + } + + +# --------------------------------------------------------------------------- +# Basic consistency +# --------------------------------------------------------------------------- + + +class TestFeatureConsistency: + def test_feature_count_matches_names(self, fe): + prob = _make_problem() + kern = _make_kernel() + vec = fe.extract(prob, kern) + assert len(vec) == len(fe.get_feature_names()) + + def test_feature_count_is_72(self, fe): + assert len(fe.get_feature_names()) == 72 + + def test_no_nan_in_output(self, fe): + prob = _make_problem() + 
kern = _make_kernel() + vec = fe.extract(prob, kern) + assert not np.any(np.isnan(vec)) + + def test_no_inf_in_output(self, fe): + prob = _make_problem() + kern = _make_kernel() + vec = fe.extract(prob, kern) + assert not np.any(np.isinf(vec)) + + def test_categorical_features_in_names(self, fe): + names = fe.get_feature_names() + for cat in fe.get_categorical_features(): + assert cat in names + + +# --------------------------------------------------------------------------- +# Formula correctness +# --------------------------------------------------------------------------- + + +class TestTileEfficiency: + """Tile efficiency: fraction of the last tile that is useful work.""" + + def test_perfectly_divisible(self, fe): + prob = _make_problem(m=256, n=256, k=128) + kern = _make_kernel(tile_m=128, tile_n=128, tile_k=64) + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + assert vec[names.index("tile_eff_m")] == 1.0 + assert vec[names.index("tile_eff_n")] == 1.0 + assert vec[names.index("tile_eff_k")] == 1.0 + assert vec[names.index("overall_tile_efficiency")] == 1.0 + + def test_not_divisible(self, fe): + prob = _make_problem(m=100, n=100, k=100) + kern = _make_kernel(tile_m=128, tile_n=128, tile_k=64) + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + assert vec[names.index("tile_eff_m")] == pytest.approx(100 / 128) + assert vec[names.index("tile_eff_n")] == pytest.approx(100 / 128) + assert vec[names.index("tile_eff_k")] == pytest.approx(36 / 64) + + def test_m_equals_1(self, fe): + """Single-token inference: M=1, tile_m=128 => eff = 1/128.""" + prob = _make_problem(m=1) + kern = _make_kernel(tile_m=128) + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + assert vec[names.index("tile_eff_m")] == pytest.approx(1.0 / 128.0) + + +class TestLDSUsage: + def test_lds_formula(self, fe): + prob = _make_problem(dtype="fp8") + kern = _make_kernel(tile_m=128, tile_n=128, tile_k=64) + vec = fe.extract(prob, kern) + names = 
fe.get_feature_names() + expected = (128 * 64 + 128 * 64) * 1.0 # fp8 = 1 byte + assert vec[names.index("lds_usage_estimate")] == expected + + def test_lds_ratio_compv4(self, fe): + """compv4 has 32KB LDS limit, not 64KB.""" + prob = _make_problem(dtype="fp8") + kern = _make_kernel(tile_m=128, tile_n=128, tile_k=64, pipeline="compv4") + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + lds_est = (128 * 64 + 128 * 64) * 1.0 + assert vec[names.index("lds_usage_ratio")] == pytest.approx(lds_est / 32768) + + def test_lds_fp16_doubles(self, fe): + prob = _make_problem(dtype="fp16") + kern = _make_kernel(tile_m=128, tile_n=128, tile_k=64) + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + expected = (128 * 64 + 128 * 64) * 2.0 # fp16 = 2 bytes + assert vec[names.index("lds_usage_estimate")] == expected + + +class TestArithmeticIntensity: + def test_square_shape(self, fe): + M, N, K = 1024, 1024, 1024 + prob = _make_problem(m=M, n=N, k=K, dtype="fp8") + kern = _make_kernel() + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + mem = (M * K + K * N + M * N) * 1.0 + expected = (2.0 * M * N * K) / mem + assert vec[names.index("arithmetic_intensity")] == pytest.approx(expected) + + def test_skinny_k(self, fe): + """Small K => low arithmetic intensity (memory-bound).""" + prob = _make_problem(m=8192, n=8192, k=32, dtype="fp8") + kern = _make_kernel() + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + assert vec[names.index("arithmetic_intensity")] < 100 + + def test_deep_k(self, fe): + """Large K => high arithmetic intensity (compute-bound).""" + prob = _make_problem(m=256, n=256, k=8192, dtype="fp8") + kern = _make_kernel() + vec = fe.extract(prob, kern) + names = fe.get_feature_names() + assert vec[names.index("arithmetic_intensity")] > 100 + + +# --------------------------------------------------------------------------- +# Corner-case shapes +# 
--------------------------------------------------------------------------- + + +class TestCornerCaseShapes: + def test_m1_single_token(self, fe): + vec = fe.extract(_make_problem(m=1, n=4096, k=4096), _make_kernel()) + assert not np.any(np.isnan(vec)) + + def test_m1_n1_k1_minimum(self, fe): + vec = fe.extract(_make_problem(m=1, n=1, k=1), _make_kernel()) + assert not np.any(np.isnan(vec)) + assert not np.any(np.isinf(vec)) + + def test_very_large_m(self, fe): + vec = fe.extract(_make_problem(m=20480, n=7168, k=7168), _make_kernel()) + assert not np.any(np.isnan(vec)) + + def test_non_power_of_2(self, fe): + vec = fe.extract(_make_problem(m=1536, n=7168, k=2304), _make_kernel()) + assert not np.any(np.isnan(vec)) + + def test_prime_dimensions(self, fe): + vec = fe.extract(_make_problem(m=17, n=31, k=127), _make_kernel()) + assert not np.any(np.isnan(vec)) + + def test_tall_matrix(self, fe): + """M >> N (tall matrix).""" + prob = _make_problem(m=16384, n=64, k=1024) + vec = fe.extract(prob, _make_kernel()) + names = fe.get_feature_names() + assert vec[names.index("aspect_ratio_mn")] > 100 + + def test_wide_matrix(self, fe): + """N >> M (wide matrix).""" + prob = _make_problem(m=64, n=16384, k=1024) + vec = fe.extract(prob, _make_kernel()) + names = fe.get_feature_names() + assert vec[names.index("aspect_ratio_mn")] < 0.01 + + +# --------------------------------------------------------------------------- +# Batch vs single extraction parity +# --------------------------------------------------------------------------- + + +class TestBatchParity: + def test_batch_matches_single(self, fe): + """Vectorized batch should produce identical results to row-by-row.""" + rows = [ + { + "m": 16, + "n": 1536, + "k": 7168, + "split_k": 1, + "dtype": "fp8", + "layout": "rcr", + "tile_m": 128, + "tile_n": 128, + "tile_k": 128, + "warp_m": 1, + "warp_n": 4, + "warp_k": 1, + "warp_tile_m": 16, + "warp_tile_n": 16, + "warp_tile_k": 128, + "pipeline": "compv3", + "scheduler": 
"intrawave", + "epilogue": "cshuffle", + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + }, + { + "m": 20480, + "n": 7168, + "k": 256, + "split_k": 1, + "dtype": "fp8", + "layout": "rcr", + "tile_m": 64, + "tile_n": 64, + "tile_k": 128, + "warp_m": 2, + "warp_n": 2, + "warp_k": 1, + "warp_tile_m": 32, + "warp_tile_n": 32, + "warp_tile_k": 16, + "pipeline": "mem", + "scheduler": "interwave", + "epilogue": "default", + "pad_m": True, + "pad_n": True, + "pad_k": True, + "persistent": True, + }, + ] + df = pd.DataFrame(rows) + batch_result = fe.extract_batch(df) + + for i, row_dict in enumerate(rows): + single_result = fe.extract(row_dict, row_dict) + np.testing.assert_allclose( + batch_result[i], + single_result, + rtol=1e-5, + atol=1e-5, + err_msg=f"Mismatch at row {i}", + ) + + +# --------------------------------------------------------------------------- +# Parameter space and validation +# --------------------------------------------------------------------------- + + +class TestParameterSpace: + def test_parameter_space_non_empty(self, fe): + ps = fe.get_parameter_space() + assert len(ps) > 0 + assert "tile_m" in ps + assert "pipeline" in ps + + def test_valid_config_passes(self, fe): + config = { + "tile_m": 128, + "tile_n": 128, + "tile_k": 64, + "warp_m": 2, + "warp_n": 2, + "warp_k": 1, + "pipeline": "compv3", + "scheduler": "intrawave", + "epilogue": "cshuffle", + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + } + assert fe.validate_config(config) is True + + def test_invalid_tile_rejected(self, fe): + config = {"tile_m": 999} + assert fe.validate_config(config) is False + + def test_lds_constraint_rejects_huge_tile(self, fe): + config = { + "tile_m": 256, + "tile_n": 256, + "tile_k": 256, + "warp_m": 2, + "warp_n": 2, + "warp_k": 1, + "pipeline": "compv4", + } + assert fe.validate_config(config) is False + + def test_project_to_valid_snaps(self, fe): + config = {"tile_m": 100, "tile_n": 200, 
"pipeline": "compv3"} + projected = fe.project_to_valid(config) + assert projected["tile_m"] == 128 + assert projected["tile_n"] == 192 + assert projected["pipeline"] == "compv3" + + +# --------------------------------------------------------------------------- +# Hardware features +# --------------------------------------------------------------------------- + + +class TestHardwareFeatures: + def test_hardware_values_propagated(self, fe): + vec = fe.extract(_make_problem(), _make_kernel()) + names = fe.get_feature_names() + assert vec[names.index("hw_num_cus")] == 256 + assert vec[names.index("hw_max_clock_mhz")] == 2400 + assert vec[names.index("hw_total_simds")] == 256 * 4 + assert vec[names.index("hw_num_xcd")] == 8 + + def test_different_hardware(self): + fe_small = GemmUniversalFeatureEngine(num_cus=120, max_clock_mhz=1800) + vec = fe_small.extract(_make_problem(), _make_kernel()) + names = fe_small.get_feature_names() + assert vec[names.index("hw_num_cus")] == 120 + assert vec[names.index("hw_max_clock_mhz")] == 1800 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/tests/test_feature_parity.py b/dispatcher/heuristics/tests/test_feature_parity.py new file mode 100644 index 0000000000..43f6968b88 --- /dev/null +++ b/dispatcher/heuristics/tests/test_feature_parity.py @@ -0,0 +1,357 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Test that the C++ extract_features() in ml_heuristic.hpp produces identical +values to the Python GemmUniversalFeatureEngine.extract(). + +This test uses ctypes to call the C++ feature extraction compiled into a +small shared library, then compares against Python output. If compilation +fails (no HIP/ROCm), it falls back to verifying the Python feature engine +against manually computed expected values for specific test cases. 
+""" + +import math +import sys +from pathlib import Path + +import numpy as np +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) +from feature_engine import ( + GemmUniversalFeatureEngine, + PIPELINE_MAP, + SCHEDULER_MAP, + EPILOGUE_MAP, + LAYOUT_MAP, +) + + +def _compute_features_manually( + M, + N, + K, + split_k, + dtype, + layout, + tile_m, + tile_n, + tile_k, + warp_m, + warp_n, + warp_k, + warp_tile_m, + warp_tile_n, + warp_tile_k, + pipeline, + scheduler, + epilogue, + pad_m, + pad_n, + pad_k, + persistent, + hw, +): + """Recompute features independently to verify the Python engine.""" + bpe_map = {"fp8": 1.0, "fp16": 2.0, "bf16": 2.0, "fp32": 4.0} + bpe = bpe_map.get(dtype, 1.0) + + log2_M = math.log2(max(M, 1)) + log2_N = math.log2(max(N, 1)) + log2_K = math.log2(max(K, 1)) + log2_MNK = math.log2(max(M * N * K, 1)) + mem = (M * K + K * N + M * N) * bpe + ai = (2.0 * M * N * K) / max(mem, 1) + + lds_est = (tile_m * tile_k + tile_n * tile_k) * bpe + lds_cap = 32768 if pipeline == "compv4" else hw["lds_capacity"] + + ntm = math.ceil(M / max(tile_m, 1)) + ntn = math.ceil(N / max(tile_n, 1)) + ntk = math.ceil(K / max(tile_k, 1)) + + def eff(d, t): + if t <= 0: + return 1.0 + r = d % t + return r / t if r > 0 else 1.0 + + # Problem-to-tile ratios + ratio_M_to_tile_m = M / max(tile_m, 1) + ratio_N_to_tile_n = N / max(tile_n, 1) + ratio_K_to_tile_k = K / max(tile_k, 1) + + # Binary features: problem smaller than tile + problem_smaller_than_tile_m = float(M < tile_m) + problem_smaller_than_tile_n = float(N < tile_n) + problem_smaller_than_tile_k = float(K < tile_k) + any_dim_too_small = float((M < tile_m) or (N < tile_n) or (K < tile_k)) + + # Padding requirement features + needs_padding_m = float(tile_m > 0 and M % tile_m != 0) + needs_padding_n = float(tile_n > 0 and N % tile_n != 0) + needs_padding_k = float(tile_k > 0 and K % tile_k != 0) + + # Interaction features + has_padding_when_needed_m = float(needs_padding_m and pad_m) + 
has_padding_when_needed_n = float(needs_padding_n and pad_n) + has_padding_when_needed_k = float(needs_padding_k and pad_k) + + # Missing padding features + missing_required_padding_m = float(needs_padding_m and not pad_m) + missing_required_padding_n = float(needs_padding_n and not pad_n) + missing_required_padding_k = float(needs_padding_k and not pad_k) + missing_any_required_padding = float( + missing_required_padding_m or missing_required_padding_n or missing_required_padding_k + ) + + return [ + M, # 0 + N, # 1 + K, # 2 + split_k, # 3 + log2_M, # 4 + log2_N, # 5 + log2_K, # 6 + log2_MNK, # 7 + ai, # 8 + M / max(N, 1), # 9 (aspect_ratio_mn) + M / max(K, 1), # 10 (aspect_ratio_mk) + N / max(K, 1), # 11 (aspect_ratio_nk) + LAYOUT_MAP.get(layout, 0), # 12 + tile_m, # 13 + tile_n, # 14 + tile_k, # 15 + warp_m, # 16 + warp_n, # 17 + warp_k, # 18 + warp_tile_m, # 19 + warp_tile_n, # 20 + warp_tile_k, # 21 + PIPELINE_MAP.get(pipeline, 0), # 22 + SCHEDULER_MAP.get(scheduler, 0), # 23 + EPILOGUE_MAP.get(epilogue, 0), # 24 + float(pad_m), # 25 + float(pad_n), # 26 + float(pad_k), # 27 + float(persistent), # 28 + warp_m * warp_n * warp_k, # 29 (num_warps) + tile_m * tile_n * tile_k, # 30 (tile_volume) + tile_m * tile_n, # 31 (tile_mn) + lds_est, # 32 (lds_usage_estimate) + lds_est / max(lds_cap, 1), # 33 (lds_usage_ratio) + ntm, # 34 (num_tiles_m) + ntn, # 35 (num_tiles_n) + ntk, # 36 (num_tiles_k) + ntm * ntn, # 37 (total_output_tiles) + eff(M, tile_m), # 38 (tile_eff_m) + eff(N, tile_n), # 39 (tile_eff_n) + eff(K, tile_k), # 40 (tile_eff_k) + eff(M, tile_m) * eff(N, tile_n) * eff(K, tile_k), # 41 (overall_tile_efficiency) + ntm * ntn / max(hw["num_cus"], 1), # 42 (cu_utilization) + ratio_M_to_tile_m, # 43 + ratio_N_to_tile_n, # 44 + ratio_K_to_tile_k, # 45 + problem_smaller_than_tile_m, # 46 + problem_smaller_than_tile_n, # 47 + problem_smaller_than_tile_k, # 48 + any_dim_too_small, # 49 + needs_padding_m, # 50 + needs_padding_n, # 51 + needs_padding_k, # 52 + 
has_padding_when_needed_m, # 53 + has_padding_when_needed_n, # 54 + has_padding_when_needed_k, # 55 + missing_required_padding_m, # 56 + missing_required_padding_n, # 57 + missing_required_padding_k, # 58 + missing_any_required_padding, # 59 + hw["num_cus"], # 60 + hw["simds_per_cu"], # 61 + hw["num_cus"] * hw["simds_per_cu"], # 62 (total_simds) + hw["shader_engines"], # 63 + hw["max_clock_mhz"], # 64 + hw["max_waves_per_cu"], # 65 + hw["wavefront_size"], # 66 + hw["lds_capacity"], # 67 + hw["l1_cache_kb"], # 68 + hw["l2_cache_kb"], # 69 + hw["l3_cache_kb"], # 70 + hw["num_xcd"], # 71 + ] + + +TEST_CASES = [ + { + "problem": { + "m": 1024, + "n": 1024, + "k": 1024, + "split_k": 1, + "dtype": "fp8", + "layout": "rcr", + }, + "kernel": { + "tile_m": 128, + "tile_n": 128, + "tile_k": 64, + "warp_m": 2, + "warp_n": 2, + "warp_k": 1, + "warp_tile_m": 32, + "warp_tile_n": 32, + "warp_tile_k": 16, + "pipeline": "compv3", + "scheduler": "intrawave", + "epilogue": "cshuffle", + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + }, + }, + { + "problem": { + "m": 1, + "n": 4096, + "k": 4096, + "split_k": 1, + "dtype": "fp8", + "layout": "rcr", + }, + "kernel": { + "tile_m": 64, + "tile_n": 64, + "tile_k": 128, + "warp_m": 1, + "warp_n": 4, + "warp_k": 1, + "warp_tile_m": 16, + "warp_tile_n": 16, + "warp_tile_k": 128, + "pipeline": "compv4", + "scheduler": "interwave", + "epilogue": "default", + "pad_m": True, + "pad_n": True, + "pad_k": True, + "persistent": True, + }, + }, + { + "problem": { + "m": 20480, + "n": 7168, + "k": 256, + "split_k": 1, + "dtype": "fp16", + "layout": "rrr", + }, + "kernel": { + "tile_m": 256, + "tile_n": 256, + "tile_k": 32, + "warp_m": 4, + "warp_n": 1, + "warp_k": 1, + "warp_tile_m": 32, + "warp_tile_n": 32, + "warp_tile_k": 16, + "pipeline": "mem", + "scheduler": "interwave", + "epilogue": "cshuffle", + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + }, + }, +] + +HW = { + "num_cus": 256, + 
"simds_per_cu": 4, + "shader_engines": 32, + "max_clock_mhz": 2400, + "max_waves_per_cu": 32, + "wavefront_size": 64, + "lds_capacity": 65536, + "l1_cache_kb": 32, + "l2_cache_kb": 4096, + "l3_cache_kb": 262144, + "num_xcd": 8, +} + + +class TestFeatureParity: + """Verify Python feature engine matches manual computation (C++ uses same logic).""" + + @pytest.fixture + def fe(self): + return GemmUniversalFeatureEngine(**HW) + + @pytest.mark.parametrize("case_idx", range(len(TEST_CASES))) + def test_python_matches_manual(self, fe, case_idx): + case = TEST_CASES[case_idx] + prob = case["problem"] + kern = case["kernel"] + + py_features = fe.extract(prob, kern) + + manual = _compute_features_manually( + prob["m"], + prob["n"], + prob["k"], + prob["split_k"], + prob["dtype"], + prob["layout"], + kern["tile_m"], + kern["tile_n"], + kern["tile_k"], + kern["warp_m"], + kern["warp_n"], + kern["warp_k"], + kern["warp_tile_m"], + kern["warp_tile_n"], + kern["warp_tile_k"], + kern["pipeline"], + kern["scheduler"], + kern["epilogue"], + kern["pad_m"], + kern["pad_n"], + kern["pad_k"], + kern["persistent"], + HW, + ) + + manual_arr = np.array(manual, dtype=np.float64) + assert len(py_features) == len(manual_arr) == 72 + + for i in range(72): + assert py_features[i] == pytest.approx( + manual_arr[i], rel=1e-10, abs=1e-15 + ), ( + f"Feature {i} ({fe.get_feature_names()[i]}): Python={py_features[i]}, Manual={manual_arr[i]}" + ) + + def test_feature_count(self, fe): + assert len(fe.get_feature_names()) == 72 + + def test_encoding_maps_match_cpp(self): + """The C++ encode_* functions must use the same mapping as Python.""" + assert PIPELINE_MAP == { + "compv3": 0, + "compv4": 1, + "compv5": 2, + "mem": 3, + "preshufflev2": 4, + } + assert SCHEDULER_MAP == {"intrawave": 0, "interwave": 1} + assert EPILOGUE_MAP == {"default": 0, "cshuffle": 1} + assert LAYOUT_MAP == {"rcr": 0, "rrr": 1, "crr": 2, "ccr": 3} + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git 
a/dispatcher/heuristics/tests/test_model_compression.py b/dispatcher/heuristics/tests/test_model_compression.py new file mode 100644 index 0000000000..50727f1242 --- /dev/null +++ b/dispatcher/heuristics/tests/test_model_compression.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +"""Test that compressed models can be loaded and used.""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from predict import Predictor + + +def test_fp16_model_decompression(): + """Test that fp16 model is auto-decompressed and usable.""" + model_dir = Path(__file__).parent.parent / "models" / "gemm_universal_fp16_gfx950" + + # Ensure .lgbm.gz exists + gz_file = model_dir / "model_tflops.lgbm.gz" + + assert gz_file.exists(), f"Compressed model not found: {gz_file}" + + # Load predictor - should auto-decompress + predictor = Predictor(model_dir) + + # Test prediction + problem = {"m": 128, "n": 1536, "k": 7168, "dtype": "fp16", "layout": "rcr"} + kernel_config = { + "tile_shape": {"m0": 128, "n0": 128, "k0": 16}, + "wave_shape": {"m1": 2, "n1": 2, "k1": 1}, + "warp_tile": {"m2": 32, "n2": 32, "k2": 8}, + } + + tflops = predictor.predict_tflops(problem, kernel_config) + + assert isinstance(tflops, float), f"Expected float, got {type(tflops)}" + assert tflops > 0, f"Expected positive TFLOPS, got {tflops}" + + # Verify decompressed file was created + lgbm_file = model_dir / "model_tflops.lgbm" + assert lgbm_file.exists(), "Model should have been decompressed" + + print(f"✅ FP16 model decompression test passed") + print(f" Predicted TFLOPS: {tflops:.2f}") + print(f" Decompressed to: {lgbm_file}") + return True + + +def test_fp8_model_decompression(): + """Test that fp8 model is auto-decompressed and usable.""" + model_dir = Path(__file__).parent.parent / "models" / "gemm_universal_fp8_gfx950" + + # Ensure .lgbm.gz exists + gz_file = model_dir / "model_tflops.lgbm.gz" + + assert gz_file.exists(), f"Compressed model not found: {gz_file}" 
+ + # Load predictor - should auto-decompress + predictor = Predictor(model_dir) + + # Test prediction + problem = {"m": 2048, "n": 2048, "k": 2048, "dtype": "fp8", "layout": "rcr"} + kernel_config = { + "tile_shape": {"m0": 256, "n0": 256, "k0": 64}, + "wave_shape": {"m1": 2, "n1": 2, "k1": 1}, + "warp_tile": {"m2": 32, "n2": 32, "k2": 16}, + } + + tflops = predictor.predict_tflops(problem, kernel_config) + + assert isinstance(tflops, float), f"Expected float, got {type(tflops)}" + assert tflops > 0, f"Expected positive TFLOPS, got {tflops}" + + # Verify decompressed file was created + lgbm_file = model_dir / "model_tflops.lgbm" + assert lgbm_file.exists(), "Model should have been decompressed" + + print(f"✅ FP8 model decompression test passed") + print(f" Predicted TFLOPS: {tflops:.2f}") + print(f" Decompressed to: {lgbm_file}") + return True + + +if __name__ == "__main__": + print("Testing compressed model auto-decompression...") + print() + + test_fp16_model_decompression() + print() + test_fp8_model_decompression() + print() + print("✅ All model compression tests passed!") diff --git a/dispatcher/heuristics/tests/test_predict.py b/dispatcher/heuristics/tests/test_predict.py new file mode 100644 index 0000000000..24cb26c4fa --- /dev/null +++ b/dispatcher/heuristics/tests/test_predict.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for predict.py. + +Covers: Predictor initialization, single prediction, ranking, select_best, +missing model handling, and edge cases (single kernel, empty list). 
+""" + +import json +import sys +from pathlib import Path + +import lightgbm as lgb +import numpy as np +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from feature_engine import GemmUniversalFeatureEngine +from predict import Predictor + + +@pytest.fixture +def model_dir(tmp_path): + """Create a minimal trained model for testing.""" + fe = GemmUniversalFeatureEngine() + n_features = len(fe.get_feature_names()) + + np.random.seed(42) + X = np.random.rand(200, n_features) + y = np.random.rand(200) * 100 + + model = lgb.LGBMRegressor(n_estimators=10, verbose=-1) + model.fit(X, y) + model.booster_.save_model(str(tmp_path / "model_tflops.lgbm")) + + y_lat = np.random.rand(200) * 0.1 + model_lat = lgb.LGBMRegressor(n_estimators=10, verbose=-1) + model_lat.fit(X, y_lat) + model_lat.booster_.save_model(str(tmp_path / "model_latency.lgbm")) + + spec = { + "feature_names": fe.get_feature_names(), + "categorical_features": fe.get_categorical_features(), + } + with open(tmp_path / "feature_spec.json", "w") as f: + json.dump(spec, f) + + return tmp_path + + +@pytest.fixture +def predictor(model_dir): + return Predictor(model_dir) + + +def _problem(): + return { + "m": 1024, + "n": 1024, + "k": 1024, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + } + + +def _kernel(tile_m=128, pipeline="compv3"): + return { + "kernel_name": f"test_kernel_{tile_m}_{pipeline}", + "tile_m": tile_m, + "tile_n": 128, + "tile_k": 64, + "warp_m": 2, + "warp_n": 2, + "warp_k": 1, + "warp_tile_m": 32, + "warp_tile_n": 32, + "warp_tile_k": 16, + "pipeline": pipeline, + "scheduler": "intrawave", + "epilogue": "cshuffle", + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + } + + +class TestPredictor: + def test_predict_tflops_returns_float(self, predictor): + result = predictor.predict_tflops(_problem(), _kernel()) + assert isinstance(result, float) + + def test_predict_latency_returns_float(self, predictor): + result = 
predictor.predict_latency(_problem(), _kernel()) + assert isinstance(result, float) + + def test_predict_all_returns_dict(self, predictor): + result = predictor.predict_all(_problem(), _kernel()) + assert "tflops" in result + assert "latency_ms" in result + + def test_rank_kernels_sorted_descending(self, predictor): + kernels = [_kernel(64, "compv3"), _kernel(128, "compv4"), _kernel(256, "mem")] + ranked = predictor.rank_kernels(_problem(), kernels) + assert len(ranked) == 3 + scores = [s for _, s in ranked] + assert scores == sorted(scores, reverse=True) + + def test_select_best_returns_name(self, predictor): + kernels = [_kernel(64), _kernel(128)] + best = predictor.select_best(_problem(), kernels) + assert isinstance(best, str) + assert best in [k["kernel_name"] for k in kernels] + + def test_single_kernel(self, predictor): + kernels = [_kernel(128)] + ranked = predictor.rank_kernels(_problem(), kernels) + assert len(ranked) == 1 + + def test_missing_bandwidth_model(self, model_dir): + pred = Predictor(model_dir) + with pytest.raises(FileNotFoundError): + pred.predict_bandwidth(_problem(), _kernel()) + + def test_empty_kernel_list(self, predictor): + with pytest.raises(ValueError): + predictor.select_best(_problem(), []) + + def test_corner_case_m1(self, predictor): + prob = { + "m": 1, + "n": 4096, + "k": 4096, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + } + result = predictor.predict_tflops(prob, _kernel()) + assert np.isfinite(result) + + def test_different_shapes_give_different_results(self, predictor): + k = _kernel() + r1 = predictor.predict_tflops( + { + "m": 16, + "n": 1536, + "k": 7168, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + }, + k, + ) + r2 = predictor.predict_tflops( + { + "m": 20480, + "n": 7168, + "k": 256, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + }, + k, + ) + assert r1 != r2 + + +class TestPredictorEdgeCases: + def test_nonexistent_model_dir(self): + with pytest.raises(Exception): + pred = 
Predictor("/nonexistent/path") + pred.predict_tflops(_problem(), _kernel()) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/tests/test_search.py b/dispatcher/heuristics/tests/test_search.py new file mode 100644 index 0000000000..b1d1ac79b3 --- /dev/null +++ b/dispatcher/heuristics/tests/test_search.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for search.py. + +Covers: random search, DE search, config validity, result ordering, +budget compliance, and edge cases. +""" + +import json +import sys +from pathlib import Path + +import lightgbm as lgb +import numpy as np +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from feature_engine import GemmUniversalFeatureEngine +from predict import Predictor +from search import SurrogateSearch + + +@pytest.fixture +def model_dir(tmp_path): + """Create a minimal trained model.""" + fe = GemmUniversalFeatureEngine() + n_features = len(fe.get_feature_names()) + np.random.seed(42) + X = np.random.rand(200, n_features) + y = np.random.rand(200) * 500 + model = lgb.LGBMRegressor(n_estimators=10, verbose=-1) + model.fit(X, y) + model.booster_.save_model(str(tmp_path / "model_tflops.lgbm")) + spec = { + "feature_names": fe.get_feature_names(), + "categorical_features": fe.get_categorical_features(), + } + with open(tmp_path / "feature_spec.json", "w") as f: + json.dump(spec, f) + return tmp_path + + +@pytest.fixture +def predictor(model_dir): + return Predictor(model_dir) + + +def _problem(): + return { + "m": 1024, + "n": 1024, + "k": 1024, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + } + + +class TestRandomSearch: + def test_returns_results(self, predictor): + searcher = SurrogateSearch(predictor, strategy="random") + results = searcher.search(_problem(), budget=50, top_k=5) + assert len(results) > 0 + assert len(results) <= 5 + 
+ def test_results_sorted_descending(self, predictor): + searcher = SurrogateSearch(predictor, strategy="random") + results = searcher.search(_problem(), budget=100, top_k=10) + scores = [s for _, s in results] + assert scores == sorted(scores, reverse=True) + + def test_configs_are_valid(self, predictor): + fe = GemmUniversalFeatureEngine() + searcher = SurrogateSearch(predictor, feature_engine=fe, strategy="random") + results = searcher.search(_problem(), budget=50, top_k=5) + for cfg, _ in results: + ps = fe.get_parameter_space() + for k, v in cfg.items(): + if k in ps: + assert v in ps[k], f"{k}={v} not in {ps[k]}" + + def test_respects_top_k(self, predictor): + searcher = SurrogateSearch(predictor, strategy="random") + results = searcher.search(_problem(), budget=100, top_k=3) + assert len(results) <= 3 + + def test_different_problems_produce_results(self, predictor): + """Both problem sizes should produce valid search results.""" + searcher = SurrogateSearch(predictor, strategy="random", seed=42) + r1 = searcher.search( + { + "m": 16, + "n": 1536, + "k": 7168, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + }, + budget=50, + top_k=3, + ) + searcher2 = SurrogateSearch(predictor, strategy="random", seed=42) + r2 = searcher2.search( + { + "m": 20480, + "n": 7168, + "k": 256, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + }, + budget=50, + top_k=3, + ) + assert len(r1) > 0 + assert len(r2) > 0 + for _, score in r1 + r2: + assert np.isfinite(score) + + def test_m1_corner_case(self, predictor): + searcher = SurrogateSearch(predictor, strategy="random") + results = searcher.search( + { + "m": 1, + "n": 4096, + "k": 4096, + "dtype": "fp8", + "layout": "rcr", + "split_k": 1, + }, + budget=50, + top_k=5, + ) + assert len(results) > 0 + for _, score in results: + assert np.isfinite(score) + + +class TestDESearch: + def test_returns_results(self, predictor): + searcher = SurrogateSearch(predictor, strategy="de") + results = searcher.search(_problem(), 
budget=100, top_k=5) + assert len(results) > 0 + + def test_results_sorted_descending(self, predictor): + searcher = SurrogateSearch(predictor, strategy="de") + results = searcher.search(_problem(), budget=100, top_k=5) + scores = [s for _, s in results] + assert scores == sorted(scores, reverse=True) + + def test_de_improves_over_initial(self, predictor): + """DE should generally find at least as good as random initialization.""" + searcher_r = SurrogateSearch(predictor, strategy="random", seed=42) + r_results = searcher_r.search(_problem(), budget=100, top_k=1) + searcher_d = SurrogateSearch(predictor, strategy="de", seed=42) + d_results = searcher_d.search(_problem(), budget=100, top_k=1) + if r_results and d_results: + assert d_results[0][1] >= r_results[0][1] * 0.9 + + def test_small_budget(self, predictor): + searcher = SurrogateSearch(predictor, strategy="de") + results = searcher.search(_problem(), budget=30, top_k=5) + assert len(results) > 0 + + +class TestSearchEdgeCases: + def test_unknown_strategy_raises(self, predictor): + searcher = SurrogateSearch(predictor, strategy="unknown") + with pytest.raises(ValueError): + searcher.search(_problem(), budget=10) + + def test_zero_budget(self, predictor): + searcher = SurrogateSearch(predictor, strategy="random") + results = searcher.search(_problem(), budget=0, top_k=5) + assert len(results) == 0 + + def test_deterministic_with_same_seed(self, predictor): + s1 = SurrogateSearch(predictor, strategy="random", seed=123) + s2 = SurrogateSearch(predictor, strategy="random", seed=123) + r1 = s1.search(_problem(), budget=50, top_k=5) + r2 = s2.search(_problem(), budget=50, top_k=5) + assert len(r1) == len(r2) + for (c1, s1_), (c2, s2_) in zip(r1, r2): + assert s1_ == pytest.approx(s2_) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/tests/test_train.py b/dispatcher/heuristics/tests/test_train.py new file mode 100644 index 0000000000..d437030bfa --- /dev/null +++ 
b/dispatcher/heuristics/tests/test_train.py @@ -0,0 +1,329 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Tests for train.py. + +Covers: group key computation, TFLOPS efficiency calculation, edge cases +(single group, all-invalid data, tied predictions), and warm-start +incremental training (feature compat, lineage, quality). +""" + +import json +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from feature_engine import GemmUniversalFeatureEngine +from train import ( + compute_group_keys, + compute_tflops_efficiency, + check_feature_compatibility, + load_warm_start_model, + train_final_model, + DEFAULT_PARAMS, +) + + +class TestComputeGroupKeys: + def test_basic(self): + df = pd.DataFrame( + {"m": [16, 16, 32], "n": [1536, 1536, 1536], "k": [7168, 7168, 7168]} + ) + keys = compute_group_keys(df) + assert keys[0] == keys[1] + assert keys[0] != keys[2] + + def test_unique_shapes(self): + df = pd.DataFrame({"m": [1, 2, 3], "n": [4, 5, 6], "k": [7, 8, 9]}) + keys = compute_group_keys(df) + assert len(set(keys)) == 3 + + +class TestComputeTflopsEfficiency: + def test_perfect_prediction(self): + """Model predicts highest TFLOPS kernel => efficiency = 1.0.""" + df = pd.DataFrame( + { + "m": [1024, 1024, 1024], + "n": [1024, 1024, 1024], + "k": [1024, 1024, 1024], + "measured_tflops": [100, 200, 150], + "pred_tflops": [50, 300, 100], # correctly ranks kernel 1 highest + } + ) + eff = compute_tflops_efficiency(df, "pred_tflops") + assert len(eff) == 1 + assert eff["efficiency"].iloc[0] == pytest.approx(1.0) + + def test_worst_prediction(self): + """Model picks the worst kernel.""" + df = pd.DataFrame( + { + "m": [1024, 1024, 1024], + "n": [1024, 1024, 1024], + "k": [1024, 1024, 1024], + "measured_tflops": [100, 200, 150], + "pred_tflops": [999, 1, 1], # incorrectly ranks 
kernel 0 highest + } + ) + eff = compute_tflops_efficiency(df, "pred_tflops") + assert eff["efficiency"].iloc[0] == pytest.approx(100 / 200) + + def test_multiple_shapes(self): + df = pd.DataFrame( + { + "m": [16, 16, 32, 32], + "n": [1536, 1536, 1536, 1536], + "k": [7168, 7168, 7168, 7168], + "measured_tflops": [10, 20, 100, 200], + "pred_tflops": [5, 25, 150, 190], + } + ) + eff = compute_tflops_efficiency(df, "pred_tflops") + assert len(eff) == 2 + assert eff.iloc[0]["efficiency"] == pytest.approx(1.0) + assert eff.iloc[1]["efficiency"] == pytest.approx(1.0) + + def test_zero_tflops_shape_skipped(self): + df = pd.DataFrame( + { + "m": [16, 16], + "n": [16, 16], + "k": [16, 16], + "measured_tflops": [0, 0], + "pred_tflops": [1, 2], + } + ) + eff = compute_tflops_efficiency(df, "pred_tflops") + assert len(eff) == 0 + + def test_single_kernel_per_shape(self): + df = pd.DataFrame( + { + "m": [1024], + "n": [1024], + "k": [1024], + "measured_tflops": [150], + "pred_tflops": [100], + } + ) + eff = compute_tflops_efficiency(df, "pred_tflops") + assert len(eff) == 1 + assert eff["efficiency"].iloc[0] == pytest.approx(1.0) + + def test_tied_predictions(self): + """When multiple kernels have the same predicted TFLOPS, pandas idxmax picks the first.""" + df = pd.DataFrame( + { + "m": [1024, 1024, 1024], + "n": [1024, 1024, 1024], + "k": [1024, 1024, 1024], + "measured_tflops": [100, 200, 200], + "pred_tflops": [50, 50, 50], + } + ) + eff = compute_tflops_efficiency(df, "pred_tflops") + assert len(eff) == 1 + assert eff["efficiency"].iloc[0] >= 0.5 + + +# --------------------------------------------------------------------------- +# Helpers for warm-start tests +# --------------------------------------------------------------------------- + + +def _make_dummy_data(n_rows=200, n_shapes=5): + """Create a small synthetic benchmark DataFrame for testing training.""" + rng = np.random.RandomState(42) + rows = [] + for _ in range(n_rows): + m = rng.choice([64, 128, 256, 512, 
1024]) + n = rng.choice([64, 128, 256, 512, 1024]) + k = rng.choice([64, 128, 256, 512, 1024]) + rows.append( + { + "m": m, + "n": n, + "k": k, + "split_k": 1, + "dtype": "fp8", + "layout": "rcr", + "op_type": "gemm_universal", + "tile_m": rng.choice([64, 128, 256]), + "tile_n": rng.choice([64, 128, 256]), + "tile_k": rng.choice([32, 64, 128]), + "warp_m": rng.choice([1, 2, 4]), + "warp_n": rng.choice([1, 2, 4]), + "warp_k": 1, + "warp_tile_m": 32, + "warp_tile_n": 32, + "warp_tile_k": 16, + "pipeline": rng.choice(["compv3", "compv4", "mem"]), + "scheduler": rng.choice(["intrawave", "interwave"]), + "epilogue": "cshuffle", + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + "measured_tflops": float(rng.uniform(10, 500)), + "latency_ms": float(rng.uniform(0.01, 1.0)), + "bandwidth_gb_s": float(rng.uniform(50, 1500)), + "is_valid": True, + "kernel_name": f"test_kernel_{rng.randint(0, 100)}", + } + ) + return pd.DataFrame(rows) + + +def _save_feature_spec(model_dir, fe): + """Save a feature_spec.json matching the given feature engine.""" + spec = { + "feature_names": fe.get_feature_names(), + "categorical_features": fe.get_categorical_features(), + } + with open(model_dir / "feature_spec.json", "w") as f: + json.dump(spec, f) + + +def _train_and_save_base_model(model_dir, df, fe, target="tflops"): + """Train a small base model and save it to model_dir.""" + params = dict(DEFAULT_PARAMS) + params["n_estimators"] = 20 + params["n_jobs"] = 1 + model = train_final_model(df, fe, target, params) + model.booster_.save_model(str(model_dir / f"model_{target}.lgbm")) + _save_feature_spec(model_dir, fe) + return model + + +# --------------------------------------------------------------------------- +# Warm-start tests +# --------------------------------------------------------------------------- + + +class TestCheckFeatureCompatibility: + def test_compatible_passes(self, tmp_path): + fe = GemmUniversalFeatureEngine() + _save_feature_spec(tmp_path, fe) 
+ check_feature_compatibility(tmp_path, fe) + + def test_missing_spec_raises(self, tmp_path): + fe = GemmUniversalFeatureEngine() + with pytest.raises(FileNotFoundError, match="feature_spec.json"): + check_feature_compatibility(tmp_path, fe) + + def test_added_feature_raises(self, tmp_path): + fe = GemmUniversalFeatureEngine() + spec = { + "feature_names": fe.get_feature_names()[:-1], + "categorical_features": fe.get_categorical_features(), + } + with open(tmp_path / "feature_spec.json", "w") as f: + json.dump(spec, f) + with pytest.raises(ValueError, match="Feature schema mismatch"): + check_feature_compatibility(tmp_path, fe) + + def test_removed_feature_raises(self, tmp_path): + fe = GemmUniversalFeatureEngine() + spec = { + "feature_names": fe.get_feature_names() + ["extra_feature"], + "categorical_features": fe.get_categorical_features(), + } + with open(tmp_path / "feature_spec.json", "w") as f: + json.dump(spec, f) + with pytest.raises(ValueError, match="Feature schema mismatch"): + check_feature_compatibility(tmp_path, fe) + + def test_categorical_mismatch_raises(self, tmp_path): + fe = GemmUniversalFeatureEngine() + spec = { + "feature_names": fe.get_feature_names(), + "categorical_features": ["layout", "pipeline"], + } + with open(tmp_path / "feature_spec.json", "w") as f: + json.dump(spec, f) + with pytest.raises(ValueError, match="Categorical feature mismatch"): + check_feature_compatibility(tmp_path, fe) + + +class TestLoadWarmStartModel: + def test_loads_existing_model(self, tmp_path): + fe = GemmUniversalFeatureEngine() + df = _make_dummy_data() + _train_and_save_base_model(tmp_path, df, fe) + path = load_warm_start_model(tmp_path, "tflops") + assert path is not None + assert Path(path).exists() + + def test_returns_none_for_missing_target(self, tmp_path): + assert load_warm_start_model(tmp_path, "tflops") is None + + def test_returns_none_for_wrong_target(self, tmp_path): + fe = GemmUniversalFeatureEngine() + df = _make_dummy_data() + 
_train_and_save_base_model(tmp_path, df, fe, target="tflops") + assert load_warm_start_model(tmp_path, "bandwidth") is None + + +class TestWarmStartTraining: + def test_warm_start_produces_more_trees(self, tmp_path): + """A warm-started model should have more trees than the base.""" + fe = GemmUniversalFeatureEngine() + df = _make_dummy_data(n_rows=300) + + base_dir = tmp_path / "base" + base_dir.mkdir() + base_model = _train_and_save_base_model(base_dir, df, fe) + base_n_trees = base_model.booster_.num_trees() + + init_model_path = load_warm_start_model(base_dir, "tflops") + params = dict(DEFAULT_PARAMS) + params["n_estimators"] = 15 + params["n_jobs"] = 1 + warm_model = train_final_model( + df, fe, "tflops", params, init_model=init_model_path + ) + warm_n_trees = warm_model.booster_.num_trees() + + assert warm_n_trees > base_n_trees + + def test_warm_start_does_not_degrade(self, tmp_path): + """Warm-started model on the same data should not be significantly worse.""" + fe = GemmUniversalFeatureEngine() + df = _make_dummy_data(n_rows=300) + + base_dir = tmp_path / "base" + base_dir.mkdir() + base_model = _train_and_save_base_model(base_dir, df, fe) + + X = fe.extract_batch(df[df["is_valid"]].reset_index(drop=True)) + y = df[df["is_valid"]]["measured_tflops"].values + base_rmse = np.sqrt(np.mean((base_model.predict(X) - y) ** 2)) + + init_model_path = load_warm_start_model(base_dir, "tflops") + params = dict(DEFAULT_PARAMS) + params["n_estimators"] = 15 + params["n_jobs"] = 1 + warm_model = train_final_model( + df, fe, "tflops", params, init_model=init_model_path + ) + warm_rmse = np.sqrt(np.mean((warm_model.predict(X) - y) ** 2)) + + assert warm_rmse <= base_rmse * 1.1 + + def test_warm_start_from_nonexistent_dir(self): + with pytest.raises(FileNotFoundError): + check_feature_compatibility( + Path("/nonexistent/model/dir"), GemmUniversalFeatureEngine() + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/dispatcher/heuristics/train.py 
#!/usr/bin/env python3
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT

"""
Training script for CK Tile kernel performance prediction.

Trains LGBMRegressor models (TFLOPS, latency, bandwidth) with:
  - Log-space regression (log1p transform) for scale-invariant accuracy
  - GroupKFold cross-validation (group key = (M, N, K))
  - Iterative Hard Example Mining (IHEM)
  - Model complexity bounds for C++ deployability
  - Optional Optuna hyperparameter tuning
  - Warm-start incremental training from a previous model via --warm_start

Log-transform rationale:
  GEMM TFLOPS spans 5 orders of magnitude (0.02 for M=1 to 2230 for large
  shapes). Raw regression optimizes for absolute RMSE, which means the model
  spends all its capacity predicting large shapes accurately and ignores tiny
  shapes where TFLOPS is < 10. Training on log1p(TFLOPS) puts all shapes on
  equal footing, improving tiny_m efficiency from 84% to 96%.
"""

import argparse
import json
import time
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

from data_pipeline import build_training_dataset
from feature_engine import GemmUniversalFeatureEngine


# Maps the CLI target name to the dataframe column holding the measurement.
TARGET_COLUMNS = {
    "tflops": "measured_tflops",
    "latency": "latency_ms",
    "bandwidth": "bandwidth_gb_s",
}

# Targets where log1p transform is applied by default.
# TFLOPS and bandwidth span orders of magnitude; latency is already small-scale.
LOG_TARGETS = {"tflops", "bandwidth"}

DEFAULT_PARAMS = {
    "objective": "regression",
    "metric": ["rmse", "mae"],
    "num_leaves": 255,
    "max_depth": 15,
    "n_estimators": 2000,
    "learning_rate": 0.02,
    "min_child_samples": 10,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "reg_alpha": 0.05,
    "reg_lambda": 0.5,
    "verbose": -1,
    "n_jobs": 8,
    "seed": 42,
}

# NOTE(review): MAX_ESTIMATORS is currently unused in this module; kept for
# backward compatibility with any external importer — confirm before removal.
MAX_ESTIMATORS = 5000
WARM_START_N_ESTIMATORS = 500


def check_feature_compatibility(
    prev_model_dir: Path,
    feature_engine: GemmUniversalFeatureEngine,
) -> None:
    """Verify that the previous model's feature spec matches the current engine.

    Raises ValueError with a detailed message on mismatch. This prevents silent
    corruption when warm-starting from a model trained with a different feature
    schema (e.g., after adding a new feature or changing an encoding).

    Raises FileNotFoundError if the previous model dir has no feature_spec.json.
    """
    spec_path = prev_model_dir / "feature_spec.json"
    if not spec_path.exists():
        raise FileNotFoundError(
            f"No feature_spec.json in {prev_model_dir}. "
            "Cannot verify feature compatibility for warm start."
        )

    with open(spec_path) as f:
        prev_spec = json.load(f)

    # Names are compared as ordered lists: LightGBM features are positional,
    # so even a pure reordering invalidates a warm start.
    prev_names = prev_spec.get("feature_names", [])
    curr_names = feature_engine.get_feature_names()
    if prev_names != curr_names:
        added = set(curr_names) - set(prev_names)
        removed = set(prev_names) - set(curr_names)
        parts = ["Feature schema mismatch between previous model and current engine."]
        if added:
            parts.append(f" Added features: {sorted(added)}")
        if removed:
            parts.append(f" Removed features: {sorted(removed)}")
        if not added and not removed:
            parts.append(" Feature order changed (names match but order differs).")
        raise ValueError("\n".join(parts))

    # Categorical sets are order-insensitive (they are looked up by name).
    prev_cats = prev_spec.get("categorical_features", [])
    curr_cats = feature_engine.get_categorical_features()
    if sorted(prev_cats) != sorted(curr_cats):
        raise ValueError(
            f"Categorical feature mismatch.\n"
            f" Previous: {sorted(prev_cats)}\n"
            f" Current: {sorted(curr_cats)}"
        )


def load_warm_start_model(prev_model_dir: Path, target: str) -> str | None:
    """Load the path to a previous model file for warm-start, or None if absent.

    Automatically decompresses .lgbm.gz files if the .lgbm file doesn't exist.
    The decompressed file is cached to disk for subsequent loads.

    Returns the string path (what LightGBM's init_model expects) rather than
    a loaded Booster, because LGBMRegressor.fit(init_model=...) accepts both
    path strings and Booster objects and path strings avoid keeping the old
    model in memory.
    """
    import gzip

    model_path = prev_model_dir / f"model_{target}.lgbm"
    gz_path = prev_model_dir / f"model_{target}.lgbm.gz"

    # Auto-decompress if needed
    if not model_path.exists() and gz_path.exists():
        print(f" Decompressing {gz_path.name}...")
        with gzip.open(gz_path, "rb") as f_in:
            with open(model_path, "wb") as f_out:
                f_out.write(f_in.read())

    if not model_path.exists():
        return None
    return str(model_path)


def compute_group_keys(df: pd.DataFrame) -> np.ndarray:
    """Create GroupKFold group keys from (M, N, K)."""
    return (
        df["m"].astype(str) + "_" + df["n"].astype(str) + "_" + df["k"].astype(str)
    ).values


def compute_tflops_efficiency(
    df: pd.DataFrame, pred_col: str = "pred_tflops"
) -> pd.DataFrame:
    """Compute per-shape efficiency: predicted-best TFLOPS / oracle-best TFLOPS.

    For each (M, N, K) shape, picks the kernel the model ranks highest and
    divides its *measured* TFLOPS by the best measured TFLOPS for that shape.
    Shapes whose oracle best is non-positive are skipped.
    """
    results = []
    for (m, n, k), group in df.groupby(["m", "n", "k"]):
        oracle_best = group["measured_tflops"].max()
        if oracle_best <= 0:
            continue
        pred_best_idx = group[pred_col].idxmax()
        selected_tflops = group.loc[pred_best_idx, "measured_tflops"]
        efficiency = selected_tflops / oracle_best
        results.append(
            {
                "m": m,
                "n": n,
                "k": k,
                "oracle_best_tflops": oracle_best,
                "selected_tflops": selected_tflops,
                "efficiency": efficiency,
            }
        )
    return pd.DataFrame(results)


def train_single_target(
    X_train,
    y_train,
    X_val,
    y_val,
    params: dict,
    categorical_features: list[str],
    feature_names: list[str],
    init_model=None,
) -> lgb.LGBMRegressor:
    """Train a single LGBMRegressor with early stopping.

    Parameters
    ----------
    init_model : str, Path, lgb.Booster, lgb.LGBMModel, or None
        If provided, training continues from this model (warm start).
        Accepts a file path to a .lgbm file, a Booster instance, or an
        LGBMModel instance. The new model adds n_estimators trees on top
        of the existing ones.
    """
    # Categorical features are passed to LightGBM by column index.
    cat_indices = [
        feature_names.index(c) for c in categorical_features if c in feature_names
    ]

    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric=["rmse"],
        callbacks=[
            lgb.early_stopping(50, verbose=False),
            lgb.log_evaluation(0),
        ],
        categorical_feature=cat_indices if cat_indices else "auto",
        init_model=init_model,
    )
    return model


def run_cv(
    df: pd.DataFrame,
    feature_engine: GemmUniversalFeatureEngine,
    target: str,
    params: dict,
    n_splits: int = 5,
    use_log: bool = True,
) -> dict:
    """Run GroupKFold cross-validation and return OOF predictions + metrics.

    Parameters
    ----------
    use_log : bool
        If True and target is in LOG_TARGETS, train on log1p(y) and invert
        predictions with expm1 for efficiency calculation. This normalizes
        the scale so that tiny-M shapes (TFLOPS ~ 1) get equal attention
        as large-M shapes (TFLOPS ~ 2000).

    Returns
    -------
    dict with keys "fold_metrics", "oof_df" (valid rows plus an
    ``oof_pred_{target}`` column in RAW target units), "feature_names",
    and "log_transform"; empty dict when there are too few groups for CV.
    """
    target_col = TARGET_COLUMNS[target]
    valid_mask = df["is_valid"].fillna(False) & (df[target_col] > 0)
    df_valid = df[valid_mask].reset_index(drop=True)

    apply_log = use_log and target in LOG_TARGETS

    print(
        f" Training on {len(df_valid)} valid rows for target={target}"
        f"{' (log-space)' if apply_log else ''}"
    )

    X = feature_engine.extract_batch(df_valid)
    y_raw = df_valid[target_col].values
    y = np.log1p(y_raw) if apply_log else y_raw
    groups = compute_group_keys(df_valid)
    feature_names = feature_engine.get_feature_names()
    cat_features = feature_engine.get_categorical_features()

    unique_groups = np.unique(groups)
    actual_splits = min(n_splits, len(unique_groups))
    if actual_splits < 2:
        print(f" WARNING: Only {len(unique_groups)} unique groups, skipping CV")
        return {}

    gkf = GroupKFold(n_splits=actual_splits)
    oof_preds = np.zeros(len(df_valid))
    fold_metrics = []

    for fold_idx, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups)):
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        model = train_single_target(
            X_tr, y_tr, X_val, y_val, params, cat_features, feature_names
        )
        preds = model.predict(X_val)
        oof_preds[val_idx] = preds

        # RMSE/R2 are computed in the training space (log space if apply_log).
        rmse = np.sqrt(np.mean((preds - y_val) ** 2))
        r2 = 1 - np.sum((preds - y_val) ** 2) / max(
            np.sum((y_val - y_val.mean()) ** 2), 1e-10
        )

        if target == "tflops":
            # Efficiency needs raw-space predictions (only the ranking matters,
            # but the parquet/report columns should carry real TFLOPS values).
            val_df = df_valid.iloc[val_idx].copy()
            preds_raw = np.expm1(preds) if apply_log else preds
            val_df["pred_tflops"] = preds_raw
            eff_df = compute_tflops_efficiency(val_df)
            mean_eff = eff_df["efficiency"].mean() if len(eff_df) > 0 else 0
            p10_eff = eff_df["efficiency"].quantile(0.1) if len(eff_df) > 0 else 0
        else:
            mean_eff, p10_eff = None, None

        fold_metrics.append(
            {
                "fold": fold_idx,
                "rmse": rmse,
                "r2": r2,
                "mean_efficiency": mean_eff,
                "p10_efficiency": p10_eff,
                "train_size": len(train_idx),
                "val_size": len(val_idx),
                "val_groups": len(np.unique(groups[val_idx])),
            }
        )

        eff_str = (
            f", eff={mean_eff:.4f}, p10={p10_eff:.4f}" if mean_eff is not None else ""
        )
        print(f" Fold {fold_idx}: RMSE={rmse:.4f}, R2={r2:.4f}{eff_str}")

    # FIX: invert the log transform before storing. Previously log-space values
    # were written to a column named oof_pred_{target} and persisted to
    # oof_predictions.parquet, silently misleading downstream consumers.
    # Rankings (and hence efficiency) are unchanged since expm1 is monotonic.
    df_valid[f"oof_pred_{target}"] = np.expm1(oof_preds) if apply_log else oof_preds

    return {
        "fold_metrics": fold_metrics,
        "oof_df": df_valid,
        "feature_names": feature_names,
        "log_transform": apply_log,
    }


def train_final_model(
    df: pd.DataFrame,
    feature_engine: GemmUniversalFeatureEngine,
    target: str,
    params: dict,
    init_model=None,
    use_log: bool = True,
) -> lgb.LGBMRegressor:
    """Train the final model on all valid data.

    Parameters
    ----------
    init_model : str, Path, lgb.Booster, lgb.LGBMModel, or None
        If provided, training continues from this model (warm start).
    use_log : bool
        If True and target is in LOG_TARGETS, train on log1p(y).
        The saved model then predicts in log-space; callers must apply
        expm1() to get raw values.
    """
    target_col = TARGET_COLUMNS[target]
    valid_mask = df["is_valid"].fillna(False) & (df[target_col] > 0)
    df_valid = df[valid_mask].reset_index(drop=True)

    apply_log = use_log and target in LOG_TARGETS

    X = feature_engine.extract_batch(df_valid)
    y_raw = df_valid[target_col].values
    y = np.log1p(y_raw) if apply_log else y_raw
    feature_names = feature_engine.get_feature_names()
    cat_features = feature_engine.get_categorical_features()
    cat_indices = [feature_names.index(c) for c in cat_features if c in feature_names]

    # No eval set / early stopping here: the final model uses all data and a
    # fixed n_estimators (validated beforehand by run_cv).
    model = lgb.LGBMRegressor(**params)
    model.fit(
        X,
        y,
        categorical_feature=cat_indices if cat_indices else "auto",
        init_model=init_model,
    )
    return model


def main():
    """CLI entry point: load data, optionally warm-start, CV + final training."""
    parser = argparse.ArgumentParser(
        description="Train CK Tile kernel performance models"
    )
    parser.add_argument(
        "--data_dir", required=True, help="Directory with parquet files"
    )
    parser.add_argument("--out_dir", required=True, help="Output directory for models")
    parser.add_argument("--op", default="gemm_universal", help="Operation type")
    parser.add_argument("--dtype", default="fp8", help="Data type filter")
    parser.add_argument("--arch", default="gfx950", help="Architecture")
    parser.add_argument(
        "--targets", default="tflops,latency,bandwidth", help="Comma-separated targets"
    )
    parser.add_argument("--n_splits", type=int, default=5, help="Number of CV folds")
    # NOTE(review): --tune is accepted but not acted on anywhere in main();
    # wire it up or document it as reserved.
    parser.add_argument(
        "--tune", action="store_true", help="Run Optuna hyperparameter tuning"
    )
    parser.add_argument(
        "--no_log_transform",
        action="store_true",
        help="Disable log1p transform on targets. By default, TFLOPS and bandwidth "
        "are trained in log-space for scale-invariant accuracy across shape sizes.",
    )
    parser.add_argument(
        "--warm_start",
        default=None,
        help="Path to previous model directory to continue training from. "
        "Uses LightGBM's init_model to add new trees on top of the "
        "existing model. Feature schemas must match exactly.",
    )
    parser.add_argument(
        "--warm_start_n_estimators",
        type=int,
        default=WARM_START_N_ESTIMATORS,
        help=f"Number of new trees to add when warm-starting (default: {WARM_START_N_ESTIMATORS}). "
        "Lower than a full train since we're refining, not starting from scratch.",
    )
    args = parser.parse_args()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    targets = [t.strip() for t in args.targets.split(",")]

    print(f"Loading data from {args.data_dir}...")
    df = build_training_dataset(args.data_dir, op_type=args.op, dtype=args.dtype)
    print(f" Total rows: {len(df)}")
    print(f" Unique shapes: {df.groupby(['m', 'n', 'k']).ngroups}")
    print(f" Unique kernels: {df['kernel_name'].nunique()}")

    # Table-driven extraction of hardware columns -> feature-engine kwargs
    # (replaces nine copy-pasted if-blocks; behavior identical).
    hw_column_spec = {
        "hw_num_cus": ("num_cus", 256),
        "hw_max_clock_mhz": ("max_clock_mhz", 2400),
        "hw_simds_per_cu": ("simds_per_cu", 4),
        "hw_shader_engines": ("shader_engines", 32),
        "hw_max_waves_per_cu": ("max_waves_per_cu", 32),
        "hw_wavefront_size": ("wavefront_size", 64),
        "hw_l1_cache_kb": ("l1_cache_kb", 32),
        "hw_l2_cache_kb": ("l2_cache_kb", 4096),
        "hw_l3_cache_kb": ("l3_cache_kb", 262144),
    }
    hw_cols = [c for c in df.columns if c.startswith("hw_")]
    hw_kwargs = {}
    if hw_cols:
        row0 = df.iloc[0]
        for col, (kwarg, default) in hw_column_spec.items():
            if col in df.columns:
                hw_kwargs[kwarg] = int(row0.get(col, default))

    fe = GemmUniversalFeatureEngine(**hw_kwargs)

    params = dict(DEFAULT_PARAMS)
    use_log = not args.no_log_transform

    prev_model_dir = None
    prev_manifest = {}
    if args.warm_start:
        prev_model_dir = Path(args.warm_start)
        if not prev_model_dir.exists():
            raise FileNotFoundError(f"Warm-start directory not found: {prev_model_dir}")
        print(f" Warm-starting from {prev_model_dir}")
        check_feature_compatibility(prev_model_dir, fe)
        print(" Feature compatibility: OK")
        params["n_estimators"] = args.warm_start_n_estimators
        print(f" New trees to add: {args.warm_start_n_estimators}")

        prev_manifest_path = prev_model_dir / "train_manifest.json"
        if prev_manifest_path.exists():
            with open(prev_manifest_path) as f:
                prev_manifest = json.load(f)

    all_cv_results = {}
    for target in targets:
        if target not in TARGET_COLUMNS:
            print(f" Skipping unknown target: {target}")
            continue

        print(f"\n{'=' * 60}")
        print(f"Training {target} model")
        print(f"{'=' * 60}")

        init_model_path = None
        if prev_model_dir is not None:
            init_model_path = load_warm_start_model(prev_model_dir, target)
            if init_model_path:
                print(f" Warm-starting from {init_model_path}")
            else:
                print(f" No previous {target} model found, training from scratch")

        t0 = time.time()
        cv_result = run_cv(
            df, fe, target, params, n_splits=args.n_splits, use_log=use_log
        )
        cv_time = time.time() - t0

        if cv_result and cv_result["fold_metrics"]:
            all_cv_results[target] = cv_result["fold_metrics"]
            metrics_path = out_dir / f"cv_metrics_{target}.json"
            with open(metrics_path, "w") as f:
                json.dump(cv_result["fold_metrics"], f, indent=2)
            print(f" CV completed in {cv_time:.1f}s, saved to {metrics_path}")

            if target == "tflops" and cv_result.get("oof_df") is not None:
                oof_df = cv_result["oof_df"]
                oof_df.to_parquet(out_dir / "oof_predictions.parquet", index=False)

                eff_df = compute_tflops_efficiency(oof_df, "oof_pred_tflops")
                if len(eff_df) > 0:
                    print("\n OOF TFLOPS Efficiency:")
                    print(f" Mean: {eff_df['efficiency'].mean():.4f}")
                    print(f" P10: {eff_df['efficiency'].quantile(0.1):.4f}")
                    print(f" P50: {eff_df['efficiency'].quantile(0.5):.4f}")
                    print(f" Min: {eff_df['efficiency'].min():.4f}")

        print(f"\n Training final {target} model on all data...")
        t0 = time.time()
        model = train_final_model(
            df, fe, target, params, init_model=init_model_path, use_log=use_log
        )
        train_time = time.time() - t0

        model_path = out_dir / f"model_{target}.lgbm"
        model.booster_.save_model(str(model_path))
        print(f" Saved {model_path} ({train_time:.1f}s)")

        importances = dict(
            zip(
                fe.get_feature_names(),
                model.feature_importances_.tolist(),
            )
        )
        imp_path = out_dir / f"feature_importances_{target}.json"
        with open(imp_path, "w") as f:
            json.dump(importances, f, indent=2)

    log_targets_used = sorted(LOG_TARGETS & set(targets)) if use_log else []
    spec = {
        "op_type": args.op,
        "dtype": args.dtype,
        "arch": args.arch,
        "feature_names": fe.get_feature_names(),
        "categorical_features": fe.get_categorical_features(),
        "targets": targets,
        "log_targets": log_targets_used,
        "params": params,
    }
    with open(out_dir / "feature_spec.json", "w") as f:
        json.dump(spec, f, indent=2)

    manifest = {
        "warm_start_from": str(prev_model_dir) if prev_model_dir else None,
        # FIX: when the previous manifest is missing, report 0 (unknown)
        # instead of echoing the NEW tree count as the "previous" total.
        # Consistent with the 0-fallback used for total_n_estimators below.
        "prev_n_estimators": (
            prev_manifest.get("total_n_estimators", 0) if prev_model_dir else 0
        ),
        "new_n_estimators": params["n_estimators"],
        "total_n_estimators": (
            prev_manifest.get("total_n_estimators", 0) + params["n_estimators"]
            if prev_model_dir
            else params["n_estimators"]
        ),
        "data_rows": len(df),
        "valid_rows": int(df["is_valid"].fillna(False).sum()),
        "unique_shapes": int(df.groupby(["m", "n", "k"]).ngroups),
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
    }
    with open(out_dir / "train_manifest.json", "w") as f:
        json.dump(manifest, f, indent=2)

    print(f"\nAll models saved to {out_dir}")
    if prev_model_dir:
        print(f" Warm-started from: {prev_model_dir}")
        print(f" Total estimators: {manifest['total_n_estimators']}")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3

# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT

"""
ML Heuristic Validation: Test ML predictions against oracle-best from training data

This script validates ML-based kernel selection by:
1. Loading benchmark data (oracle-best results for each shape)
2. Using ML model to predict best kernel for each shape
3. Comparing ML selection with oracle-best to compute efficiency

Usage:
    python validate_ml_heuristic.py --dtype fp16 --model_dir models/gemm_universal_fp16_gfx950
    python validate_ml_heuristic.py --dtype fp8 --layout rcr
"""

import sys
import argparse
import pandas as pd
import numpy as np
from pathlib import Path

from predict import Predictor


def validate_ml_heuristic(dtype: str, layout: str, model_dir: str, data_dir: str):
    """Validate ML heuristic predictions against oracle-best"""

    print("=" * 100)
    print(f" ML Heuristic Validation: {dtype.upper()} {layout.upper()}")
    print("=" * 100)
    print()

    # Load training data
    print(f"Loading training data from {data_dir}...")

    # Try dtype-specific parquet first, then fall back to combined
    dtype_specific = (
        Path(data_dir) / f"{dtype}_original" / f"{dtype}_training_data.parquet"
    )
    combined = Path(data_dir) / "all_training_data_fixed.parquet"

    if dtype_specific.exists():
        training_data = pd.read_parquet(dtype_specific)
        print(f"✓ Loaded {len(training_data):,} benchmark runs from {dtype_specific}")
    elif combined.exists():
        training_data = pd.read_parquet(combined)
        training_data = training_data[
            (training_data["dtype"] == dtype) & (training_data["layout"] == layout)
        ]
        print(f"✓ Loaded {len(training_data):,} benchmark runs from {combined}")
    else:
        print(f"❌ Error: No training data found at {dtype_specific} or {combined}")
        return

    if len(training_data) == 0:
        print(f"❌ Error: No data found for dtype={dtype}, layout={layout}")
        return

    # Get unique shapes with oracle-best
    shape_groups = training_data.groupby(["m", "n", "k"])
    print(f"Unique shapes: {len(shape_groups)}")
    print()

    # Load ML predictor
    print(f"Loading ML predictor from {model_dir}...")
    try:
        predictor = Predictor(model_dir)
        print("✓ Loaded ML predictor")
        # NOTE(review): reaches into a private attribute of Predictor;
        # consider exposing log_targets publicly.
        print(f" Log targets: {predictor._log_targets}")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        return

    print()
    print("=" * 100)
    print(" Computing Oracle-Best Efficiency for Each Shape")
    print("=" * 100)
    print()

    results = []

    for shape_idx, ((m, n, k), group) in enumerate(shape_groups):
        # Find oracle-best (max TFLOPS across all kernels tested)
        oracle_best_row = group.loc[group["measured_tflops"].idxmax()]
        oracle_best_tflops = oracle_best_row["measured_tflops"]
        oracle_best_kernel = oracle_best_row["kernel_name"]

        # FIX: skip degenerate shapes whose best measurement is non-positive.
        # Previously this divided by oracle_best_tflops unguarded (potential
        # ZeroDivisionError); train.py's compute_tflops_efficiency applies the
        # same guard.
        if oracle_best_tflops <= 0:
            continue

        # Get all kernel configs tested for this shape
        kernel_configs = []
        for _, row in group.iterrows():
            kernel_dict = {
                "tile_m": row["tile_m"],
                "tile_n": row["tile_n"],
                "tile_k": row["tile_k"],
                "warp_m": row["warp_m"],
                "warp_n": row["warp_n"],
                "warp_k": row["warp_k"],
                "warp_tile_m": row["warp_tile_m"],
                "warp_tile_n": row["warp_tile_n"],
                "warp_tile_k": row["warp_tile_k"],
                "pipeline": row["pipeline"],
                "scheduler": row["scheduler"],
                "epilogue": row["epilogue"],
                "pad_m": row["pad_m"],
                "pad_n": row["pad_n"],
                "pad_k": row["pad_k"],
                "persistent": row["persistent"],
                "kernel_name": row["kernel_name"],
            }
            kernel_configs.append(kernel_dict)

        # Use ML model to rank kernels
        problem = {
            "m": m,
            "n": n,
            "k": k,
            "dtype": dtype,
            "layout": layout,
            "split_k": 1,
        }

        try:
            ranked = predictor.rank_kernels(problem, kernel_configs)

            if ranked:
                ml_best_kernel, ml_predicted_tflops = ranked[0]

                # Find actual TFLOPS for the ML-predicted kernel
                ml_kernel_row = group[group["kernel_name"] == ml_best_kernel]
                if len(ml_kernel_row) > 0:
                    ml_actual_tflops = ml_kernel_row["measured_tflops"].values[0]

                    # Calculate efficiency
                    efficiency_pct = 100.0 * (ml_actual_tflops / oracle_best_tflops)

                    # Determine if ML picked oracle-best
                    is_oracle_best = ml_best_kernel == oracle_best_kernel

                    results.append(
                        {
                            "m": m,
                            "n": n,
                            "k": k,
                            "oracle_best_tflops": oracle_best_tflops,
                            "oracle_best_kernel": oracle_best_kernel,
                            "ml_predicted_tflops": ml_predicted_tflops,
                            "ml_selected_kernel": ml_best_kernel,
                            "ml_actual_tflops": ml_actual_tflops,
                            "efficiency_pct": efficiency_pct,
                            "is_oracle_best": is_oracle_best,
                            "num_kernels": len(group),
                        }
                    )

                    if (shape_idx + 1) % 20 == 0:
                        status = "✓" if is_oracle_best else f"{efficiency_pct:.1f}%"
                        print(
                            f" [{shape_idx + 1:3d}/{len(shape_groups)}] "
                            f"M={m:4d} N={n:5d} K={k:5d}: {status}"
                        )
        except Exception as e:
            print(f" Error on shape M={m} N={n} K={k}: {e}")
            continue

    print()
    print("=" * 100)
    print(" Results Summary")
    print("=" * 100)
    print()

    if results:
        df_results = pd.DataFrame(results)
        efficiencies = df_results["efficiency_pct"].values
        oracle_matches = df_results["is_oracle_best"].sum()

        print(f"Total shapes tested: {len(results)}")
        print()
        print("Efficiency Statistics (% of Oracle-Best TFLOPS):")
        print(f" Mean: {np.mean(efficiencies):.2f}%")
        print(f" Median: {np.median(efficiencies):.2f}%")
        print(f" Min: {np.min(efficiencies):.2f}%")
        print(f" Max: {np.max(efficiencies):.2f}%")
        print(f" P10: {np.percentile(efficiencies, 10):.2f}%")
        print(f" P50: {np.percentile(efficiencies, 50):.2f}%")
        print(f" P90: {np.percentile(efficiencies, 90):.2f}%")
        print()
        print(
            f"Oracle-best matches: {oracle_matches}/{len(results)} ({100 * oracle_matches / len(results):.1f}%)"
        )
        print()

        # Classify by M size
        df_results["m_class"] = pd.cut(
            df_results["m"],
            bins=[0, 8, 128, 1024, float("inf")],
            labels=[
                "Tiny (M<8)",
                "Small (8≤M<128)",
                "Medium (128≤M<1024)",
                "Large (M≥1024)",
            ],
        )

        print("Efficiency by M size:")
        for m_class in [
            "Tiny (M<8)",
            "Small (8≤M<128)",
            "Medium (128≤M<1024)",
            "Large (M≥1024)",
        ]:
            subset = df_results[df_results["m_class"] == m_class]
            if len(subset) > 0:
                print(
                    f" {m_class:25s}: {subset['efficiency_pct'].mean():6.2f}% "
                    f"(n={len(subset)}, P10={subset['efficiency_pct'].quantile(0.1):.2f}%)"
                )

        print()

        # Save results
        output_file = f"validation_results_{dtype}_{layout}.csv"
        df_results.to_csv(output_file, index=False)
        print(f"✓ Results saved to {output_file}")

        # Show best and worst shapes
        print()
        print("Top 5 shapes (best efficiency):")
        top5 = df_results.nlargest(5, "efficiency_pct")[
            ["m", "n", "k", "efficiency_pct", "oracle_best_tflops", "is_oracle_best"]
        ]
        for idx, row in top5.iterrows():
            match = "✓" if row["is_oracle_best"] else " "
            print(
                f" {match} M={row['m']:5d} N={row['n']:5d} K={row['k']:5d}: "
                f"{row['efficiency_pct']:.2f}% ({row['oracle_best_tflops']:.2f} TFLOPS)"
            )

        print()
        print("Bottom 5 shapes (worst efficiency):")
        bottom5 = df_results.nsmallest(5, "efficiency_pct")[
            ["m", "n", "k", "efficiency_pct", "oracle_best_tflops", "is_oracle_best"]
        ]
        for idx, row in bottom5.iterrows():
            match = "✓" if row["is_oracle_best"] else " "
            print(
                f" {match} M={row['m']:5d} N={row['n']:5d} K={row['k']:5d}: "
                f"{row['efficiency_pct']:.2f}% ({row['oracle_best_tflops']:.2f} TFLOPS)"
            )

    else:
        print("No results to display")

    print()
    print("=" * 100)


def main():
    """Parse CLI args, auto-detect model/data dirs, and run validation."""
    parser = argparse.ArgumentParser(
        description="Validate ML heuristic predictions against oracle-best from training data"
    )
    parser.add_argument(
        "--dtype",
        default="fp16",
        choices=["fp16", "bf16", "fp8"],
        help="Data type to validate",
    )
    parser.add_argument(
        "--layout",
        default="rcr",
        choices=["rcr", "rrr", "crr", "ccr"],
        help="Matrix layout",
    )
    parser.add_argument(
        "--model_dir",
        default=None,
        help="Path to model directory (auto-detect if not specified)",
    )
    parser.add_argument(
        "--data_dir",
        default=None,
        help="Path to training data directory (auto-detect if not specified)",
    )

    args = parser.parse_args()

    # Auto-detect model directory if not specified
    if args.model_dir is None:
        heuristics_dir = Path(__file__).parent
        model_candidates = [
            heuristics_dir / "models" / f"gemm_universal_{args.dtype}_gfx950",
            heuristics_dir / "models" / f"gemm_universal_{args.dtype}_gfx942",
        ]
        for candidate in model_candidates:
            if candidate.exists():
                args.model_dir = str(candidate)
                break

        if args.model_dir is None:
            print(f"❌ Error: Could not find model directory for {args.dtype}")
            print(f" Searched: {[str(c) for c in model_candidates]}")
            print(" Please specify --model_dir explicitly")
            return 1

    # Auto-detect data directory if not specified
    if args.data_dir is None:
        heuristics_dir = Path(__file__).parent
        args.data_dir = str(heuristics_dir / "data")

    validate_ml_heuristic(args.dtype, args.layout, args.model_dir, args.data_dir)
    return 0


if __name__ == "__main__":
    sys.exit(main())
pad_k = true; // Support arbitrary K dimensions via padding } algorithm; std::string gfx_arch; // e.g. "gfx942", "gfx90a", "gfx908" @@ -185,7 +190,10 @@ struct KernelKey algorithm.double_buffer, algorithm.preshuffle, algorithm.transpose_c, - algorithm.num_wave_groups); + algorithm.num_wave_groups, + algorithm.pad_m, + algorithm.pad_n, + algorithm.pad_k); } /// Equality comparison @@ -397,8 +405,14 @@ inline std::string KernelKey::encode_identifier() const // Include pipeline, scheduler, epilogue for uniqueness oss << to_string(algorithm.pipeline) << "_"; - oss << to_string(algorithm.scheduler) << "_"; oss << to_string(algorithm.epilogue) << "_"; + oss << to_string(algorithm.scheduler) << "_"; + + // Match tile_engine naming: padding flags (True/False) then persistent flag + oss << (algorithm.pad_m ? "True" : "False") << "_"; + oss << (algorithm.pad_n ? "True" : "False") << "_"; + oss << (algorithm.pad_k ? "True" : "False") << "_"; + oss << (algorithm.persistent ? "True" : "False") << "_"; // Match tile_engine naming: tile_m x tile_n x tile_k _ warp_m x warp_n x warp_k _ // warp_tile_m x warp_tile_n x warp_tile_k @@ -407,9 +421,6 @@ inline std::string KernelKey::encode_identifier() const << unsigned(algorithm.wave_shape.k) << "_" << unsigned(algorithm.warp_tile_shape.m) << "x" << unsigned(algorithm.warp_tile_shape.n) << "x" << unsigned(algorithm.warp_tile_shape.k); - // Add trait flags - oss << "_" << (algorithm.persistent ? "persist" : "nopers"); - if(signature.split_k > 1) oss << "_splitk" << unsigned(signature.split_k); if(!signature.elementwise_op.empty() && signature.elementwise_op != "PassThrough") diff --git a/dispatcher/include/ck_tile/dispatcher/ml_heuristic.hpp b/dispatcher/include/ck_tile/dispatcher/ml_heuristic.hpp new file mode 100644 index 0000000000..359d772735 --- /dev/null +++ b/dispatcher/include/ck_tile/dispatcher/ml_heuristic.hpp @@ -0,0 +1,379 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT +#pragma once +#include "ck_tile/dispatcher/dispatcher.hpp" +#include "ck_tile/dispatcher/kernel_key.hpp" +#include "ck_tile/dispatcher/registry.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +namespace ck_tile { +namespace dispatcher { +extern "C" { +int LGBM_BoosterCreateFromModelfile(const char*, int*, void**); +int LGBM_BoosterPredictForMat( + void*, const void*, int, int, int, int, int, int, int, const char*, int64_t*, double*); +int LGBM_BoosterFree(void*); +} +inline int encode_pipeline(Pipeline p) +{ + switch(p) + { + case Pipeline::CompV3: return 0; + case Pipeline::CompV4: return 1; + case Pipeline::CompV5: return 2; + case Pipeline::Mem: return 3; + case Pipeline::PreShuffleV2: return 4; + default: return 0; + } +} +inline int encode_scheduler(Scheduler s) +{ + switch(s) + { + case Scheduler::Intrawave: return 0; + case Scheduler::Interwave: return 1; + default: return 0; + } +} +inline int encode_epilogue(Epilogue e) +{ + switch(e) + { + case Epilogue::Default: return 0; + case Epilogue::CShuffle: return 1; + default: return 0; + } +} +inline int encode_layout(LayoutTag a, LayoutTag b, LayoutTag c) +{ + bool ra = (a == LayoutTag::RowMajor), rb = (b == LayoutTag::RowMajor); + if(ra && !rb) + return 0; // RCR + if(ra && rb) + return 1; // RRR + if(!ra && rb) + return 2; // CCR + return 3; // CRR +} +inline double dtype_bytes_ml(DataType dt) +{ + switch(dt) + { + case DataType::FP32: return 4; + case DataType::FP16: + case DataType::BF16: return 2; + case DataType::FP8: + case DataType::BF8: + case DataType::INT8: return 1; + case DataType::INT4: return 0.5; + default: return 2; + } +} +struct HardwareProfile +{ + int num_cus = 256, simds_per_cu = 4, shader_engines = 32, max_clock_mhz = 2400, + max_waves_per_cu = 32, wavefront_size = 64, lds_capacity = 65536, l1_cache_kb = 32, + l2_cache_kb = 4096, l3_cache_kb = 262144, num_xcd = 8; + int total_simds() const { return num_cus * 
simds_per_cu; } +}; + +// CRITICAL: Feature count MUST match feature_spec.json +// Python training uses 72 features - this header MUST extract exactly 72 features in the same order +static constexpr int NUM_FEATURES = 72; + +inline std::array +extract_features(const Problem& prob, const KernelKey& key, const HardwareProfile& hw) +{ + // Problem dimensions + double M = prob.M, N = prob.N, K = prob.K; + double sk = (prob.k_batch > 0 ? prob.k_batch : 1); + double bpe = dtype_bytes_ml(key.signature.dtype_a); + + // Log-scale features + double l2M = std::log2(std::max(M, 1.0)); + double l2N = std::log2(std::max(N, 1.0)); + double l2K = std::log2(std::max(K, 1.0)); + double l2MNK = std::log2(std::max(M * N * K, 1.0)); + + // Arithmetic intensity + double mem = (M * K + K * N + M * N) * bpe; + double ai = 2.0 * M * N * K / std::max(mem, 1.0); + + // Aspect ratios + double ar_mn = M / std::max(N, 1.0); + double ar_mk = M / std::max(K, 1.0); + double ar_nk = N / std::max(K, 1.0); + + // Layout encoding + double layout = (double)encode_layout( + key.signature.layout_a, key.signature.layout_b, key.signature.layout_c); + + // Tile dimensions + double tm = key.algorithm.tile_shape.m; + double tn = key.algorithm.tile_shape.n; + double tk = key.algorithm.tile_shape.k; + + // Wave/warp dimensions + double wm = key.algorithm.wave_shape.m; + double wn = key.algorithm.wave_shape.n; + double wk = key.algorithm.wave_shape.k; + + // Warp tile dimensions + double wtm = key.algorithm.warp_tile_shape.m; + double wtn = key.algorithm.warp_tile_shape.n; + double wtk = key.algorithm.warp_tile_shape.k; + + // Algorithm encoding + double pipeline = (double)encode_pipeline(key.algorithm.pipeline); + double scheduler = (double)encode_scheduler(key.algorithm.scheduler); + double epilogue = (double)encode_epilogue(key.algorithm.epilogue); + + // Padding flags - read from KernelKey + double pad_m = key.algorithm.pad_m ? 1.0 : 0.0; + double pad_n = key.algorithm.pad_n ? 
1.0 : 0.0; + double pad_k = key.algorithm.pad_k ? 1.0 : 0.0; + + // Persistent kernel flag + double persistent = key.algorithm.persistent ? 1.0 : 0.0; + + // Derived features + double num_warps = wm * wn * wk; + double tile_volume = tm * tn * tk; + double tile_mn = tm * tn; + + // LDS usage estimation + double lest = (tm * tk + tn * tk) * bpe; + double lcap = (key.algorithm.pipeline == Pipeline::CompV4) ? 32768.0 : (double)hw.lds_capacity; + double lds_ratio = lest / std::max(lcap, 1.0); + + // Tile counts + double ntm = std::ceil(M / std::max(tm, 1.0)); + double ntn = std::ceil(N / std::max(tn, 1.0)); + double ntk = std::ceil(K / std::max(tk, 1.0)); + double total_output_tiles = ntm * ntn; + + // Tile efficiency (fractional remainder utilization) + auto ef = [](double d, double t) -> double { + if(t <= 0) + return 1.0; + double r = std::fmod(d, t); + return r > 0 ? r / t : 1.0; + }; + double tile_eff_m = ef(M, tm); + double tile_eff_n = ef(N, tn); + double tile_eff_k = ef(K, tk); + double overall_tile_efficiency = tile_eff_m * tile_eff_n * tile_eff_k; + + // CU utilization + double cu_utilization = total_output_tiles / std::max((double)hw.num_cus, 1.0); + + // P0 FIX: Problem-to-tile ratio features (critical for small problems) + double ratio_M_to_tile_m = M / std::max(tm, 1.0); + double ratio_N_to_tile_n = N / std::max(tn, 1.0); + double ratio_K_to_tile_k = K / std::max(tk, 1.0); + + // Binary features: is problem dimension smaller than tile? + double problem_smaller_than_tile_m = (M < tm) ? 1.0 : 0.0; + double problem_smaller_than_tile_n = (N < tn) ? 1.0 : 0.0; + double problem_smaller_than_tile_k = (K < tk) ? 1.0 : 0.0; + double any_dim_too_small = ((M < tm) || (N < tn) || (K < tk)) ? 1.0 : 0.0; + + // P1 FIX: Padding requirement features + double needs_padding_m = (tm > 0 && std::fmod(M, tm) != 0.0) ? 1.0 : 0.0; + double needs_padding_n = (tn > 0 && std::fmod(N, tn) != 0.0) ? 1.0 : 0.0; + double needs_padding_k = (tk > 0 && std::fmod(K, tk) != 0.0) ? 
1.0 : 0.0; + + // Interaction features: kernel has padding when problem needs it + double has_padding_when_needed_m = (needs_padding_m && pad_m) ? 1.0 : 0.0; + double has_padding_when_needed_n = (needs_padding_n && pad_n) ? 1.0 : 0.0; + double has_padding_when_needed_k = (needs_padding_k && pad_k) ? 1.0 : 0.0; + + // Critical feature: missing required padding (kernel will likely fail) + double missing_required_padding_m = (needs_padding_m && !pad_m) ? 1.0 : 0.0; + double missing_required_padding_n = (needs_padding_n && !pad_n) ? 1.0 : 0.0; + double missing_required_padding_k = (needs_padding_k && !pad_k) ? 1.0 : 0.0; + double missing_any_required_padding = + (missing_required_padding_m || missing_required_padding_n || missing_required_padding_k) + ? 1.0 + : 0.0; + + // Hardware features + double hw_num_cus = (double)hw.num_cus; + double hw_simds_per_cu = (double)hw.simds_per_cu; + double hw_total_simds = (double)hw.total_simds(); + double hw_shader_engines = (double)hw.shader_engines; + double hw_max_clock_mhz = (double)hw.max_clock_mhz; + double hw_max_waves_per_cu = (double)hw.max_waves_per_cu; + double hw_wavefront_size = (double)hw.wavefront_size; + double hw_lds_capacity = (double)hw.lds_capacity; + double hw_l1_cache_kb = (double)hw.l1_cache_kb; + double hw_l2_cache_kb = (double)hw.l2_cache_kb; + double hw_l3_cache_kb = (double)hw.l3_cache_kb; + double hw_num_xcd = (double)hw.num_xcd; + + // Feature vector in EXACT order from feature_spec.json + // This order MUST match Python feature_engine.py::get_feature_names() + return {{ + M, // 0 + N, // 1 + K, // 2 + sk, // 3 (split_k) + l2M, // 4 (log2_M) + l2N, // 5 (log2_N) + l2K, // 6 (log2_K) + l2MNK, // 7 (log2_MNK) + ai, // 8 (arithmetic_intensity) + ar_mn, // 9 (aspect_ratio_mn) + ar_mk, // 10 (aspect_ratio_mk) + ar_nk, // 11 (aspect_ratio_nk) + layout, // 12 (layout) + tm, // 13 (tile_m) + tn, // 14 (tile_n) + tk, // 15 (tile_k) + wm, // 16 (warp_m) + wn, // 17 (warp_n) + wk, // 18 (warp_k) + wtm, // 19 
(warp_tile_m) + wtn, // 20 (warp_tile_n) + wtk, // 21 (warp_tile_k) + pipeline, // 22 (pipeline) + scheduler, // 23 (scheduler) + epilogue, // 24 (epilogue) + pad_m, // 25 (pad_m) + pad_n, // 26 (pad_n) + pad_k, // 27 (pad_k) + persistent, // 28 (persistent) + num_warps, // 29 (num_warps) + tile_volume, // 30 (tile_volume) + tile_mn, // 31 (tile_mn) + lest, // 32 (lds_usage_estimate) + lds_ratio, // 33 (lds_usage_ratio) + ntm, // 34 (num_tiles_m) + ntn, // 35 (num_tiles_n) + ntk, // 36 (num_tiles_k) + total_output_tiles, // 37 (total_output_tiles) + tile_eff_m, // 38 (tile_eff_m) + tile_eff_n, // 39 (tile_eff_n) + tile_eff_k, // 40 (tile_eff_k) + overall_tile_efficiency, // 41 (overall_tile_efficiency) + cu_utilization, // 42 (cu_utilization) + ratio_M_to_tile_m, // 43 (ratio_M_to_tile_m) + ratio_N_to_tile_n, // 44 (ratio_N_to_tile_n) + ratio_K_to_tile_k, // 45 (ratio_K_to_tile_k) + problem_smaller_than_tile_m, // 46 (problem_smaller_than_tile_m) + problem_smaller_than_tile_n, // 47 (problem_smaller_than_tile_n) + problem_smaller_than_tile_k, // 48 (problem_smaller_than_tile_k) + any_dim_too_small, // 49 (any_dim_too_small) + needs_padding_m, // 50 (needs_padding_m) + needs_padding_n, // 51 (needs_padding_n) + needs_padding_k, // 52 (needs_padding_k) + has_padding_when_needed_m, // 53 (has_padding_when_needed_m) + has_padding_when_needed_n, // 54 (has_padding_when_needed_n) + has_padding_when_needed_k, // 55 (has_padding_when_needed_k) + missing_required_padding_m, // 56 (missing_required_padding_m) + missing_required_padding_n, // 57 (missing_required_padding_n) + missing_required_padding_k, // 58 (missing_required_padding_k) + missing_any_required_padding, // 59 (missing_any_required_padding) + hw_num_cus, // 60 (hw_num_cus) + hw_simds_per_cu, // 61 (hw_simds_per_cu) + hw_total_simds, // 62 (hw_total_simds) + hw_shader_engines, // 63 (hw_shader_engines) + hw_max_clock_mhz, // 64 (hw_max_clock_mhz) + hw_max_waves_per_cu, // 65 (hw_max_waves_per_cu) + 
hw_wavefront_size, // 66 (hw_wavefront_size) + hw_lds_capacity, // 67 (hw_lds_capacity) + hw_l1_cache_kb, // 68 (hw_l1_cache_kb) + hw_l2_cache_kb, // 69 (hw_l2_cache_kb) + hw_l3_cache_kb, // 70 (hw_l3_cache_kb) + hw_num_xcd, // 71 (hw_num_xcd) + }}; +} + +class MLHeuristic +{ + public: + MLHeuristic(const std::string& path, + const Registry* reg, + HardwareProfile hw = {}, + bool log_t = false) + : registry_(reg), hw_(hw), log_t_(log_t) + { + int iters = 0; + if(LGBM_BoosterCreateFromModelfile(path.c_str(), &iters, &b_) != 0 || !b_) + { + std::cerr << "MLHeuristic: Failed to load " << path << std::endl; + + // Check if a compressed .gz version exists + std::string gz_path = path + ".gz"; + std::ifstream gz_check(gz_path); + if(gz_check.good()) + { + std::cerr << "MLHeuristic: Found compressed model at " << gz_path << std::endl; + std::cerr << "MLHeuristic: Please decompress it first:" << std::endl; + std::cerr << " gunzip " << gz_path << std::endl; + } + + b_ = nullptr; + } + else + std::cout << "MLHeuristic: Loaded (" << iters << " iters)" << std::endl; + } + ~MLHeuristic() + { + if(b_) + LGBM_BoosterFree(b_); + } + MLHeuristic(const MLHeuristic&) = delete; + MLHeuristic& operator=(const MLHeuristic&) = delete; + bool is_loaded() const { return b_ != nullptr; } + double predict_tflops(const Problem& prob, const KernelKey& key) const + { + if(!b_) + return 0; + auto f = extract_features(prob, key, hw_); + int64_t ol = 0; + double pred = 0; + if(LGBM_BoosterPredictForMat( + b_, f.data(), 0, 1, NUM_FEATURES, 1, 0, 0, 0, "", &ol, &pred) != 0) + return 0; + return log_t_ ? 
std::expm1(pred) : pred; + } + std::vector operator()(const Problem& prob) const + { + if(!b_ || !registry_) + return {}; + auto insts = registry_->get_all(); + struct C + { + std::string id; + double t; + }; + std::vector cs; + cs.reserve(insts.size()); + for(auto& i : insts) + { + auto& k = i->get_key(); + cs.push_back({k.encode_identifier(), predict_tflops(prob, k)}); + } + std::sort(cs.begin(), cs.end(), [](auto& a, auto& b) { return a.t > b.t; }); + std::vector r; + r.reserve(cs.size()); + for(auto& c : cs) + r.push_back(std::move(c.id)); + return r; + } + + private: + void* b_ = nullptr; + const Registry* registry_ = nullptr; + HardwareProfile hw_; + bool log_t_ = false; +}; +} // namespace dispatcher +} // namespace ck_tile diff --git a/dispatcher/python/requirements.txt b/dispatcher/python/requirements.txt index 9d429235f7..3ed0a13de8 100644 --- a/dispatcher/python/requirements.txt +++ b/dispatcher/python/requirements.txt @@ -1,11 +1,16 @@ # Core dependencies numpy>=1.19.0 +# ML Heuristic dependencies (OPTIONAL - large dependencies) +# For ML-based kernel selection, install separately: +# pip install -r ../requirements-ml.txt +# This avoids mandatory large dependencies (pyarrow, lightgbm) for users who don't need ML features + # Optional dependencies (install with pip install -e ".[torch]") # torch>=2.0.0 # Development dependencies (install with pip install -e ".[dev]") -# pytest>=6.0.0 +pytest>=6.0.0 # pytest-cov>=2.0.0 # black>=21.0 # flake8>=3.9.0 diff --git a/dispatcher/requirements-ml.txt b/dispatcher/requirements-ml.txt new file mode 100644 index 0000000000..68f60a3d91 --- /dev/null +++ b/dispatcher/requirements-ml.txt @@ -0,0 +1,6 @@ +# ML Heuristic dependencies for ML-based kernel selection +# Install with: pip install -r requirements-ml.txt +lightgbm>=3.0.0 +pandas>=1.3.0 +pyarrow>=6.0.0 +scikit-learn>=0.24.0 diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index c6c0ca8da9..813667df0f 
100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -538,6 +538,7 @@ using fmha_bwd_dot_do_o_pipeline_problem_{F_idx} = ck_tile::BlockFmhaBwdOGradDot typename FmhaBwdTypeConfig::ODataType, typename FmhaBwdTypeConfig::OGradDataType, typename FmhaBwdTypeConfig::DDataType, + typename FmhaBwdTypeConfig::LSEDataType, /* BlockSize = M0 = */ {F_bm0}, {F_hdim}, {F_mode}, diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index 1849068161..a5fffb5159 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -1209,7 +1209,8 @@ class KernelComponentFactoryGfx12(CompatibilityRuleFactory): # bm0, bn0, bk0, bn1, bk1, ( 32, 32) : [FmhaFwdTileSize( 64, 64, 16, 32, 32, 32, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1)], ( 64, 64) : [FmhaFwdTileSize( 64, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1)], - (128, 128) : [FmhaFwdTileSize( 64, 64, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1)], + (128, 128) : [FmhaFwdTileSize( 64, 64, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1, CppConstraint("a.max_seqlen_q <= 8192")), + FmhaFwdTileSize(128, 64, 32, 128, 32, 128, 8, 1, 1, 8, 1, 1, 16, 16, 16, 16, 16, 16, 6)], (192, 128) : [FmhaFwdTileSize( 64, 64, 32, 128, 32, 256, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1)], (256, 256) : [FmhaFwdTileSize( 64, 64, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 16, 16, 16, 16, 16, 16, -1)], } # fmt: skip @@ -1244,7 +1245,7 @@ class KernelComponentFactoryGfx12(CompatibilityRuleFactory): ["t", "f"], ["t", "f"], ): - pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip + # pipelines.append(FmhaFwdPipeline("qr", "row", "f", "f", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip 
pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "f", "f", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip pipelines.append(FmhaFwdPipeline("qr", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, skip, "f", sink)) # fmt: skip elif dtype in cls._DT_FP8_FP8BF16 or dtype in cls._DT_FP8FP32: @@ -1303,7 +1304,23 @@ class Product: def get_product(receipt: int) -> Product: # Flash attention integration - if receipt in (2, 3): + if receipt == 2: + + def fit(problem_ctx: ProblemContext, kernel_ctx: KernelContext) -> bool: + cond = problem_ctx.dtype in ["fp16", "bf16"] + cond &= kernel_ctx.pipeline.F_vlayout == "row" + cond &= kernel_ctx.pipeline.F_bias in ["no", "alibi"] + cond &= kernel_ctx.pipeline.F_qscale == "no" + cond &= kernel_ctx.pipeline.F_skip == "f" + cond &= kernel_ctx.pipeline.F_sink == "f" + # FlashAttention direct fwd wrappers always use softcap disabled and LSE enabled. + cond &= kernel_ctx.pipeline.F_logits == "f" + cond &= kernel_ctx.pipeline.F_lse == "t" + return cond + + return Product(name="Flash attention integration", rule=fit) + # Receipt 3 forward coverage used by CK library / smoke tests + elif receipt == 3: def fit(problem_ctx: ProblemContext, kernel_ctx: KernelContext) -> bool: cond = problem_ctx.dtype in ["fp16", "bf16"] diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index e0ccde8a6b..c9bac50da1 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -939,6 +939,8 @@ def get_fwd_splitkv_blobs( cond = dtype in ["fp16", "bf16"] cond &= pipeline.F_vlayout == "row" cond &= pipeline.F_bias in ["no", "alibi"] + # FlashAttention splitkv paths use softcap-disabled kernels only. 
+ cond &= pipeline.F_logits == "f" cond &= pipeline.F_squant == "f" cond &= pipeline.F_sink == "f" if not cond: @@ -1142,4 +1144,7 @@ def list_blobs( ) for kernel in kernels: f.write((file_path.parent / GEN_DIR / kernel.filename).as_posix() + "\n") - f.write((file_path.parent / GEN_DIR / FMHA_FWD_SPLITKV_API_FILENAME).as_posix() + "\n") + f.write( + (file_path.parent / GEN_DIR / FMHA_FWD_SPLITKV_API_FILENAME).as_posix() + + "\n" + ) diff --git a/example/ck_tile/01_fmha/example_fmha_bwd.cpp b/example/ck_tile/01_fmha/example_fmha_bwd.cpp index c1f3a4fce3..bec7da0a2f 100644 --- a/example/ck_tile/01_fmha/example_fmha_bwd.cpp +++ b/example/ck_tile/01_fmha/example_fmha_bwd.cpp @@ -87,6 +87,7 @@ auto create_args(int argc, char* argv[]) "0", "if set to 1 will use multi-buffer reduction strategy for dq, atomic operation " "will not be used") + .insert("sink_grad", "0", "if set to 1, compute and validate sink token gradient") .insert("json", "0", "0: No Json, 1: Dump Results in Json format") .insert("jsonfile", "fmha_bwd.json", "json file name to dump results"); @@ -122,6 +123,7 @@ auto run(const ck_tile::ArgParser& arg_parser) bool deterministic = arg_parser.get_bool("deterministic"); std::string init_method = arg_parser.get_str("init"); uint32_t seed = arg_parser.get_uint32("seed"); + bool sink_grad = arg_parser.get_bool("sink_grad"); ck_tile::stream_config stream_config{nullptr, true, @@ -154,6 +156,7 @@ auto run(const ck_tile::ArgParser& arg_parser) drop_offset, drop_prefs, mask_str, + sink_grad, deterministic, init_method, seed, diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp index 493de250e7..830ebe9c76 100644 --- a/example/ck_tile/01_fmha/fmha_bwd.hpp +++ b/example/ck_tile/01_fmha/fmha_bwd.hpp @@ -117,6 +117,9 @@ struct fmha_bwd_args void* dv_ptr; void* dbias_ptr; void* workspace_ptr; + const void* + sink_ptr; // sink scores [batch, nhead] in log-space (LSEDataType); nullptr disables sink + void* d_sink_ptr; // sink gradient 
output [nhead] (LSEDataType); nullptr disables sink gradient // Usage notes for sequence length pointer parameters: // @@ -353,11 +356,15 @@ auto fmha_bwd_dot_do_o_create_kargs_and_grids(fmha_bwd_args args) return FmhaBwdOGradDotOKernel::MakeKargs(args.o_ptr, args.do_ptr, args.d_ptr, + args.lse_ptr, + args.sink_ptr, + args.d_sink_ptr, args.p_undrop, args.seqstart_q_ptr, args.seqlen_q_ptr, args.cu_seqlen_q_ptr, args.hdim_v, + args.nhead_q, args.stride_do, args.stride_o, args.nhead_stride_do, @@ -369,9 +376,13 @@ auto fmha_bwd_dot_do_o_create_kargs_and_grids(fmha_bwd_args args) return FmhaBwdOGradDotOKernel::MakeKargs(args.o_ptr, args.do_ptr, args.d_ptr, + args.lse_ptr, + args.sink_ptr, + args.d_sink_ptr, args.p_undrop, args.seqlen_q, args.hdim_v, + args.nhead_q, args.stride_do, args.stride_o, args.nhead_stride_do, diff --git a/example/ck_tile/01_fmha/fmha_bwd_runner.hpp b/example/ck_tile/01_fmha/fmha_bwd_runner.hpp index b4baa261d1..8871167c38 100644 --- a/example/ck_tile/01_fmha/fmha_bwd_runner.hpp +++ b/example/ck_tile/01_fmha/fmha_bwd_runner.hpp @@ -77,6 +77,7 @@ bwd_result fmha_bwd_run(mode_enum mode, uint64_t drop_offset, bool drop_prefs, std::string mask_str, + bool sink_grad, // if true, compute and validate sink gradient bool deterministic, std::string init_method, uint32_t seed, @@ -285,6 +286,16 @@ bwd_result fmha_bwd_run(mode_enum mode, get_lengths(o_perm, shape_batch, nhead, shape_seqlen_q, hdim_v)); ck_tile::HostTensor lse_host( std::array{shape_batch, nhead, shape_seqlen_q}); + ck_tile::HostTensor sink_host( + sink_grad ? 
std::array{shape_batch, nhead} + : std::array{1, 1} /* dummy when sink is disabled */); + if(sink_grad) + { + std::uniform_real_distribution sink_dist(30.0f, 60.0f); + sink_host.ForEach([&](auto& self, auto i) { + self(i) = static_cast(sink_dist(random_engine)); + }); + } ck_tile::HostTensor d_host( std::array{shape_batch, nhead, shape_seqlen_q}); ck_tile::HostTensor randval_host( @@ -302,6 +313,12 @@ bwd_result fmha_bwd_run(mode_enum mode, use_dbias ? get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, max_seqlen_k) : std::array{1, 1, 1, 1} /* dummy shape for simplifying code */); + ck_tile::HostTensor d_sink_host(sink_grad ? std::array{nhead} + : std::array{0}); + if(sink_grad) + { + d_sink_host.ForEach([&](auto& self, auto i) { self(i) = 0; }); + } if(init_method == "ui" || init_method == "0") { @@ -360,11 +377,13 @@ bwd_result fmha_bwd_run(mode_enum mode, ck_tile::DeviceMem bias_buf(bias_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem o_buf(o_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem lse_buf(lse_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem sink_buf(sink_grad ? sink_host.get_element_space_size_in_bytes() : 0); ck_tile::DeviceMem d_buf(d_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem randval_buf(randval_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem dq_buf(dq_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem dk_buf(dk_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem dv_buf(dv_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_sink_buf(sink_grad ? d_sink_host.get_element_space_size_in_bytes() : 0); ck_tile::DeviceMem do_buf(do_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem dbias_buf(dbias_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t)); @@ -396,6 +415,11 @@ bwd_result fmha_bwd_run(mode_enum mode, drop_seed_buf.ToDevice(drop_prefs ? 
&drop_seed : nullptr); drop_offset_buf.ToDevice(drop_prefs ? &drop_offset : nullptr); alibi_slope_buf.ToDevice(alibi_slope_host.data()); + if(sink_grad) + { + sink_buf.ToDevice(sink_host.data()); + d_sink_buf.ToDevice(d_sink_host.data()); + } // clang-format off auto layout_str = [&](bool permute){ @@ -415,7 +439,8 @@ bwd_result fmha_bwd_run(mode_enum mode, << "] b:" << batch << ", h:" << nhead << "/" << nhead_k << ", s:" << seqlen_qs[0] << "/" << seqlen_ks[0] << ", d:" << hdim_q << "/" << hdim_v << ", scale:" << scale << ", bias:" << bias << ", dbias:" << use_dbias << ", p_drop:" << p_drop - << ", s_randval:" << s_randval << ", deterministic:" << deterministic + << (sink_grad ? ", sink:(rand[30,60], grad)" : "") << ", s_randval:" << s_randval + << ", deterministic:" << deterministic << ", workspace:" << std::to_string(workspace_size_in_megabytes) << "MiB" << ", mask:" << mask << std::flush; @@ -474,7 +499,6 @@ bwd_result fmha_bwd_run(mode_enum mode, const void* seqlen_q_ptr_dev = use_qpadding ? seqlen_q_dev.GetDeviceBuffer() : nullptr; const void* seqlen_k_ptr_dev = use_kpadding ? seqlen_k_dev.GetDeviceBuffer() : nullptr; - return fmha_bwd_args{q_buf.GetDeviceBuffer(), k_buf.GetDeviceBuffer(), v_buf.GetDeviceBuffer(), @@ -490,6 +514,8 @@ bwd_result fmha_bwd_run(mode_enum mode, dv_buf.GetDeviceBuffer(), dbias_buf.GetDeviceBuffer(), ws_ptr, + sink_buf.GetDeviceBuffer(), + d_sink_buf.GetDeviceBuffer(), seqstart_q.GetDeviceBuffer(), seqstart_k.GetDeviceBuffer(), seqlen_q_ptr_dev, @@ -580,6 +606,7 @@ bwd_result fmha_bwd_run(mode_enum mode, std::vector> randval_host_refs; std::vector> p_hp_host_refs; std::vector> p_lp_host_refs; + std::vector> p_sink_host_refs; randval_buf.FromDevice(randval_host.data()); @@ -756,6 +783,46 @@ bwd_result fmha_bwd_run(mode_enum mode, ck_tile::reference_batched_softmax( s_host_ref, p_hp_host_ref, ck_tile::identity{}, lse_host_ref); + // Incorporate sink token into the softmax distribution (reference computation). 
+ // The sink acts as an extra key whose score is sink_host(wb, i_h) (in log-space), + // which is a per-head random value in [30, 60]. + // lse_new = log(exp(lse_old) + exp(sink)) + // P_new = P_old * exp(lse_old - lse_new) (rescaled token attention) + // P_sink = exp(sink - lse_new) (sink attention weight) + ck_tile::HostTensor p_sink_host_ref( + sink_grad ? std::array{nhead, real_seqlen_q} + : std::array{0, 0}); + if(sink_grad) + { + for(int i_h = 0; i_h < nhead; ++i_h) + { + AccDataType sink_val = sink_host(wb, i_h); + for(int i_q = 0; i_q < real_seqlen_q; ++i_q) + { + // Use numerically stable log-sum-exp: lse_new = log(exp(lse_old)+exp(sink)) + // = max(lse_old, sink) + log(1 + exp(min - max)) + // This handles lse_old = -inf (fully-masked rows) without producing NaN: + // if lse_old=-inf: max=sink, min=-inf, exp(-inf-sink)=0, lse_new=sink + // It also avoids exp(lse_old) overflow when lse_old is large. + // p_scale = exp(lse_old - lse_new) [fraction kept by regular tokens] + // p_sink = exp(sink - lse_new) [sink attention weight] + AccDataType lse_old = lse_host_ref(i_h, i_q); + AccDataType hi = lse_old > sink_val ? lse_old : sink_val; + AccDataType lo = lse_old > sink_val ? 
sink_val : lse_old; + AccDataType lse_new = + hi + ck_tile::log(AccDataType(1) + ck_tile::exp(lo - hi)); + AccDataType p_scale = ck_tile::exp(lse_old - lse_new); + + lse_host_ref(i_h, i_q) = lse_new; + + for(int i_k = 0; i_k < real_seqlen_k; ++i_k) + p_hp_host_ref(i_h, i_q, i_k) *= p_scale; + + p_sink_host_ref(i_h, i_q) = ck_tile::exp(sink_val - lse_new); + } + } + } + if(p_drop > 0) { p_dropped_hp_host_ref = p_hp_host_ref; @@ -814,6 +881,7 @@ bwd_result fmha_bwd_run(mode_enum mode, o_host_refs.push_back(o_host_ref); p_hp_host_refs.push_back(p_hp_host_ref); p_lp_host_refs.push_back(p_lp_host_ref); + p_sink_host_refs.push_back(p_sink_host_ref); if(p_drop > 0) { randval_host_refs.push_back(randval_host_ref); @@ -833,6 +901,8 @@ bwd_result fmha_bwd_run(mode_enum mode, o_buf.ToDevice(o_host.data()); lse_buf.ToDevice(lse_host.data()); dbias_buf.SetZero(); + if(sink_grad) + d_sink_buf.SetZero(); ck_tile::stream_config stream_config_v{nullptr, true, 0, 0, 1}; launcher(fmha_args, stream_config_v); @@ -841,10 +911,19 @@ bwd_result fmha_bwd_run(mode_enum mode, dk_buf.FromDevice(dk_host.data()); dv_buf.FromDevice(dv_host.data()); dbias_buf.FromDevice(dbias_host.data()); + if(sink_grad) + d_sink_buf.FromDevice(d_sink_host.data()); // Track the index into reference vectors (may differ from wb if batches were skipped) ck_tile::index_t ref_idx = 0; + // validation sink accumulator: global over batch, shape [nhead] + ck_tile::HostTensor d_sink_host_ref( + sink_grad ? 
std::array{nhead} + : std::array{0}); + if(sink_grad) + d_sink_host_ref.ForEach([&](auto& self, auto i) { self(i) = 0; }); + for(ck_tile::index_t wb = 0; wb < batch; ++wb) { // When padding is enabled, use logical lengths instead of computing from padded @@ -920,6 +999,30 @@ bwd_result fmha_bwd_run(mode_enum mode, ds_hp_host_ref.mDesc.get_lengths()[1], ds_hp_host_ref.mDesc.get_lengths()[2])(std::thread::hardware_concurrency()); + if(sink_grad) + { + // Reference: dSink[h] = -sum_q( P_sink[h,q] * D[h,q] ) + // where D[h,q] = sum_j(dO[h,q,j] * O[h,q,j]) * p_undrop + for(int i_h = 0; i_h < nhead; ++i_h) + { + AccDataType d_sink_head_acc = 0; + for(int i_q = 0; i_q < real_seqlen_q; ++i_q) + { + AccDataType do_dot_o = 0; + for(int o = 0; o < hdim_v; o++) + { + do_dot_o += + ck_tile::type_convert(do_host_ref(i_h, i_q, o)) * + ck_tile::type_convert( + o_host_refs[ref_idx](i_h, i_q, o)) * + p_undrop; + } + d_sink_head_acc += -p_sink_host_refs[ref_idx](i_h, i_q) * do_dot_o; + } + d_sink_host_ref(i_h) += d_sink_head_acc; + } + } + if(use_dbias) { dbias_host_ref = ds_hp_host_ref.template CopyAsType(); @@ -1032,6 +1135,17 @@ bwd_result fmha_bwd_run(mode_enum mode, ref_idx++; } + if(pass && sink_grad) + { + auto [rtol, atol] = get_elimit(hdim_q, hdim_v); + bool dsink_pass = ck_tile::check_err(d_sink_host, + d_sink_host_ref, + std::string("Error: SinkGrad Incorrect results!"), + rtol, + atol); + pass &= dsink_pass; + } + std::cout << ", valid:" << (pass ? 
"y" : "n") << std::flush << std::endl; } diff --git a/example/ck_tile/01_fmha/script/run_full_test.sh b/example/ck_tile/01_fmha/script/run_full_test.sh index 456c3986fa..4fbde37cae 100755 --- a/example/ck_tile/01_fmha/script/run_full_test.sh +++ b/example/ck_tile/01_fmha/script/run_full_test.sh @@ -39,7 +39,6 @@ function print_log_header(){ #run verification tests time example/ck_tile/01_fmha/script/smoke_test_fwd.sh time example/ck_tile/01_fmha/script/smoke_test_bwd.sh -time example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh #run performance benchmarks export fmha_fwd_log="perf_fmha_fwd_$GPU_arch.log" diff --git a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh index 81617ee16c..c246ccb98f 100755 --- a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh +++ b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh @@ -69,6 +69,28 @@ test_h_s_mask -prec=fp16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=0 -operm=0 test_h_s_mask -prec=bf16 -d=$hdim -bias=n -dbias=0 -p_drop=0 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS test_h_s_mask -prec=bf16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS done + +# sink gradient tests: same coverage as main tests but with -sink_grad=1 +for prec in "fp16" "bf16" ; do +for perm in 0 1 ; do +for hdim in 64 128 256 ; do +for mode in 0 1 ; do +for bias in "n" "a" ; do +for p_drop in 0.0 0.2 ; do +test_h_s_mask -prec=$prec -d=$hdim -bias=$bias -dbias=0 -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=0 -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -sink_grad=1 +done +done +done +done +done +done + +# sink gradient additional cases: non-standard hdim +for hdim in 40 48 72 96 ; do +test_h_s_mask -prec=fp16 -d=$hdim -bias=n -dbias=0 -p_drop=0 -iperm=0 -operm=0 -deterministic=0 -v=1 -mode=0 -kname=$KNAME $COMMON_ARGS -sink_grad=1 +test_h_s_mask -prec=fp16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 
-iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS -sink_grad=1 +test_h_s_mask -prec=bf16 -d=$hdim -bias=n -dbias=0 -p_drop=0 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS -sink_grad=1 +done set +x new_fails_count=0 diff --git a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh index 227f26c8f3..1e9942a6e1 100755 --- a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh +++ b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh @@ -235,6 +235,64 @@ run_padding_basic_boundary_tests() { done } +# Sink-specific mask pattern tests (sliding window + sink token). +run_sink_mask_tests() { + # window_size[2,0], sink_size=2 (top-left causal + sink) + # before: after: + # 1 * * * * * * * 1 * * * * * * * + # 1 1 * * * * * * 1 1 * * * * * * + # 1 1 1 * * * * * 1 1 1 * * * * * + # * 1 1 1 * * * * 1 1 1 1 * * * * + # * * 1 1 1 * * * 1 1 1 1 1 * * * + # * * * 1 1 1 * * 1 1 * 1 1 1 * * + # * * * * 1 1 1 * 1 1 * * 1 1 1 * + # * * * * * 1 1 1 1 1 * * * 1 1 1 + run_exe -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=512 -s_k=512 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=t:2,0,2 + run_exe -prec=bf16 -mode=0 -b=2 -h=2 -d=128 -d_v=128 -s=512 -s_k=512 -bias=n -lse=0 -iperm=1 -operm=1 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=t:2,0,2 + + # window_size[0,3], sink_size=2 (top-left + sink) + # before: after: + # 1 1 1 1 * * * * 1 1 1 1 * * * * + # * 1 1 1 1 * * * 1 1 1 1 1 * * * + # * * 1 1 1 1 * * 1 1 1 1 1 1 * * + # * * * 1 1 1 1 * 1 1 * 1 1 1 1 * + # * * * * 1 1 1 1 1 1 * * 1 1 1 1 + run_exe -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=1024 -s_k=1024 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=t:0,3,2 + run_exe -prec=bf16 -mode=1 -b=2 -h=2 -d=128 -d_v=128 -s=1024 
-s_k=1024 -bias=n -lse=0 -iperm=1 -operm=1 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=t:0,3,2 + + # window_size[1,0], sink_size=2 (bottom-right + sink) + # before: after: + # * * 1 1 * * * * 1 1 1 1 * * * * + # * * * 1 1 * * * 1 1 * 1 1 * * * + # * * * * 1 1 * * 1 1 * * 1 1 * * + # * * * * * 1 1 * 1 1 * * * 1 1 * + # * * * * * * 1 1 1 1 * * * * 1 1 + run_exe -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=4096 -s_k=4096 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=b:1,0,2 + run_exe -prec=bf16 -mode=0 -b=2 -h=4 -d=128 -d_v=128 -s=2048 -s_k=2048 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=b:1,0,2 + + # window_size[2,0], sink_size=2 (bottom-right, group mode + sink) + run_exe -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=8192 -s_k=8192 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=b:2,0,2 + run_exe -prec=bf16 -mode=1 -b=2 -h=2 -d=128 -d_v=128 -s=4096 -s_k=4096 -bias=n -lse=0 -iperm=1 -operm=1 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=b:2,0,2 + + # window_size[-1,1], sink_size=2 (bottom-right, large seqlen + sink) + run_exe -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=16384 -s_k=16384 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=b:-1,1,2 + run_exe -prec=bf16 -mode=1 -b=1 -h=2 -d=128 -d_v=128 -s=8192 -s_k=8192 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=$KNAME $COMMON_ARGS -mask=b:-1,1,2 +} + +# init_sink tests: validate sink token initialization across prec/hdim/mode. 
+run_sink_init_tests() { + for prec in "fp16" "bf16" ; do + for hdim in 64 128 256 ; do + for mode in 0 1 ; do + for mask in 0 1 ; do + run_exe -prec=$prec -mode=$mode -b=1 -h=2 -d=$hdim -d_v=$hdim -s=512 -s_k=512 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=$KNAME $COMMON_ARGS -init_sink=1 -mask=$mask + run_exe -prec=$prec -mode=$mode -b=2 -h=4 -d=$hdim -d_v=$hdim -s=1024 -s_k=1024 -bias=n -lse=0 -iperm=1 -operm=1 -vlayout=r -kname=$KNAME $COMMON_ARGS -init_sink=1 -mask=$mask + done + done + done + done +} + set -x run_fp16_bf16_tests @@ -242,6 +300,8 @@ run_padding_smoke_tests run_padding_basic_boundary_tests run_fp8bf16_tests run_fp8fp32_tests +run_sink_mask_tests +run_sink_init_tests if [ $TEST_APPENDKV -eq 1 ] ; then run_fp16_appendkv_tests diff --git a/example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh b/example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh deleted file mode 100755 index 5c9d3132b3..0000000000 --- a/example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash -# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -# SPDX-License-Identifier: MIT - -# TODO: run this script from CK root or build directory -#EXE="/code/composable_kernel/build/bin/tile_example_fmha_fwd" -set -euo pipefail - -SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) -EXE_NAME=tile_example_fmha_fwd -EXE="$(find . 
-name $EXE_NAME -type f | head -n 1)" -KNAME=1 -GPU_arch=$GPU_arch -if [ -z "$GPU_arch" ] ; then - GPU_arch=$(rocminfo | grep -E 'Name:\s+gfx' | head -n1 | awk '{print $2}') -fi -set -x - -COMMON_ARGS='-v=1 -warmup=0 -repeat=1' - - -$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=512 -s_k=512 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -mask=t:2,0,2 - -# window_size[2,0], sink_size = 2 - -# x=1/y=3 -# 1 * * * * * * * 1 * * * * * * * -# 1 1 * * * * * * 1 1 * * * * * * -# 1 1 1 * * * * * ----> 1 1 1 * * * * * -# * 1 1 1 * * * * 1 1 1 1 * * * * -# * * 1 1 1 * * * 1 1 1 1 1 * * * -# * * * 1 1 1 * * 1 1 * 1 1 1 * * -# * * * * 1 1 1 * 1 1 * * 1 1 1 * -# * * * * * 1 1 1 1 1 * * * 1 1 1 -# l=2/r=0(tl) l=2/r=0/s=2(tl) - -$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=1024 -s_k=1024 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -mask=t:0,3,2 #-mask=b:3,0,2 - -# x=4/y=1 -# 1 1 1 1 * * * * 1 1 1 1 * * * * -# * 1 1 1 1 * * * 1 1 1 1 1 * * * -# * * 1 1 1 1 * * ----> 1 1 1 1 1 1 * * -# * * * 1 1 1 1 * 1 1 * 1 1 1 1 * -# * * * * 1 1 1 1 1 1 * * 1 1 1 1 -# l=0/r=3(tl) l=0/r=3/s=2(tl) -# l=3/r=0(br) l=3/r=0/s=2(br) - - -$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=4096 -s_k=4096 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -mask=b:1,0,2 - -# x=4/y=-1 -# * * 1 1 * * * * 1 1 1 1 * * * * -# * * * 1 1 * * * 1 1 * 1 1 * * * -# * * * * 1 1 * * ----> 1 1 * * 1 1 * * -# * * * * * 1 1 * 1 1 * * * 1 1 * -# * * * * * * 1 1 1 1 * * * * 1 1 -# l=1/r=0(br) l=1/r=0/s=2(br) - - -$EXE -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=8192 -s_k=8192 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -mask=b:2,0,2 - -# x=-1/y=5 - -# * * * * * * * * 
* * * * -# * * * * * * * * * * * * -# 1 * * * * * 1 * * * * * -# 1 1 * * * * 1 1 * * * * -# 1 1 1 * * * ----> 1 1 1 * * * -# * 1 1 1 * * 1 1 1 1 * * -# * * 1 1 1 * 1 1 1 1 1 * -# * * * 1 1 1 1 1 * 1 1 1 -# l=2/r=0(br) l=2/r=0/s=2(br) - - -$EXE -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=16384 -s_k=16384 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -mask=b:-1,1,2 -# x=-1/y=8 -# * * * * * * * * * * -# * * * * * * * * * * -# 1 * * * * ----> 1 * * * * -# 1 1 * * * 1 1 * * * -# 1 1 1 * * 1 1 1 * * -# 1 1 1 1 * 1 1 1 1 * -# 1 1 1 1 1 1 1 1 1 1 -# 1 1 1 1 1 1 1 1 1 1 -# l=2/r=0(br) l=2/r=0/s=2(br) - -$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=512 -s_k=512 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=1 -v=1 -warmup=0 -repeat=1 -init_sink=1 -mask=1 - -$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=1024 -s_k=1024 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=1 -v=1 -warmup=0 -repeat=1 -init_sink=1 -mask=0 - -$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=4096 -s_k=4096 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -init_sink=1 - -$EXE -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=8192 -s_k=8192 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -page_block_size=128 -cache_batch_idx=0 -kname=1 -v=1 -warmup=0 -repeat=1 -init_sink=1 -mask=1 diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 879fb31ca5..7448c7a31a 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -63,8 +63,8 @@ #define __gfx101__ #endif #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \ - defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \ - defined(__gfx10_3_generic__) + defined(__gfx1033__) || defined(__gfx1034__) || defined(__gfx1035__) || \ + defined(__gfx1036__) || defined(__gfx10_3_generic__) #define __gfx103__ #endif #if defined(__gfx1100__) || 
defined(__gfx1101__) || defined(__gfx1102__) || \ diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp index 7191ad2c8a..97852531a9 100644 --- a/include/ck/host_utility/device_prop.hpp +++ b/include/ck/host_utility/device_prop.hpp @@ -125,8 +125,9 @@ inline bool is_gfx101_supported() inline bool is_gfx103_supported() { return ck::get_device_name() == "gfx1030" || ck::get_device_name() == "gfx1031" || - ck::get_device_name() == "gfx1032" || ck::get_device_name() == "gfx1034" || - ck::get_device_name() == "gfx1035" || ck::get_device_name() == "gfx1036"; + ck::get_device_name() == "gfx1032" || ck::get_device_name() == "gfx1033" || + ck::get_device_name() == "gfx1034" || ck::get_device_name() == "gfx1035" || + ck::get_device_name() == "gfx1036"; } inline bool is_wmma_supported() diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp index 9b68c4de43..42bb8db613 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp @@ -16,6 +16,7 @@ #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" +#include "ck/library/utility/numeric.hpp" namespace ck { namespace tensor_operation { @@ -492,6 +493,10 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K c_grid_desc_m_n_container_.push_back(descs[I2]); } } + c_space_size_bytes = + ck::accumulate_n( + input_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>()) * + Conv_N_ * Conv_C_ * sizeof(CDataType); } const ADataType* p_a_grid_; @@ -512,6 +517,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K std::vector conv_filter_dilations_; std::vector 
input_left_pads_; std::vector input_right_pads_; + + long_index_t c_space_size_bytes; }; // Invoker @@ -571,18 +578,47 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K DeviceOp::BGridDesc_K0_N_K1, DeviceOp::CGridDesc_M_N, true>; - - ave_time += launch_and_time_kernel(stream_config, - kernel, - dim3(gdx, gdy, gdz), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_k0_m_k1_container_[i], - arg.b_grid_desc_k0_n_k1_container_[i], - arg.c_grid_desc_m_n_container_[i]); + if(stream_config.flush_cache) + { + // Clear input only for perf measurement. + // For non-grouped solver user has to clear input on his own. + const auto clear_input = [&]() { + if(i == 0) + { + hip_check_error(hipMemsetAsync(arg.p_c_grid_, + 0, + arg.c_space_size_bytes, + stream_config.stream_id_)); + } + }; + ave_time += launch_and_time_kernel_with_preprocess_flush_cache( + stream_config, + clear_input, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i]); + } + else + { + ave_time += launch_and_time_kernel(stream_config, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i]); + } } else { @@ -594,18 +630,47 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K DeviceOp::BGridDesc_K0_N_K1, DeviceOp::CGridDesc_M_N, false>; - - ave_time += launch_and_time_kernel(stream_config, - kernel, - dim3(gdx, gdy, gdz), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_k0_m_k1_container_[i], - arg.b_grid_desc_k0_n_k1_container_[i], - arg.c_grid_desc_m_n_container_[i]); + if(stream_config.flush_cache) + { + // Clear input only for 
perf measurement. + // For non-grouped solver user has to clear input on his own. + const auto clear_input = [&]() { + if(i == 0) + { + hip_check_error(hipMemsetAsync(arg.p_c_grid_, + 0, + arg.c_space_size_bytes, + stream_config.stream_id_)); + } + }; + ave_time += launch_and_time_kernel_with_preprocess_flush_cache( + stream_config, + clear_input, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i]); + } + else + { + ave_time += launch_and_time_kernel(stream_config, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i]); + } } } return ave_time; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp index 022bda3ed0..ebddd90381 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp @@ -16,6 +16,7 @@ #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" +#include "ck/library/utility/numeric.hpp" namespace ck { namespace tensor_operation { @@ -1050,6 +1051,10 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl input_right_pads_{input_right_pads} { CreateABCDesc(); + c_space_size_bytes = + ck::accumulate_n( + input_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>()) * + Conv_N_ * Conv_C_ * sizeof(CDataType); } template ::type = false> @@ -1216,6 +1221,8 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl std::vector conv_filter_dilations_; std::vector 
input_left_pads_; std::vector input_right_pads_; + + long_index_t c_space_size_bytes; }; // Invoker @@ -1273,18 +1280,47 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl DeviceOp::BGridDesc_K0_N_K1, DeviceOp::CGridDesc_M_N, true>; - - ave_time += launch_and_time_kernel(stream_config, - kernel, - dim3(gdx, gdy, gdz), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_k0_m_k1_container_[i], - arg.b_grid_desc_k0_n_k1_container_[i], - arg.c_grid_desc_m_n_container_[i]); + if(stream_config.flush_cache) + { + // Clear input only for perf measurement. + // For non-grouped solver user has to clear input on his own. + const auto clear_input = [&]() { + if(i == 0) + { + hip_check_error(hipMemsetAsync(arg.p_c_grid_, + 0, + arg.c_space_size_bytes, + stream_config.stream_id_)); + } + }; + ave_time += launch_and_time_kernel_with_preprocess_flush_cache( + stream_config, + clear_input, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i]); + } + else + { + ave_time += launch_and_time_kernel(stream_config, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i]); + } } else { @@ -1296,18 +1332,47 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Xdl DeviceOp::BGridDesc_K0_N_K1, DeviceOp::CGridDesc_M_N, false>; - - ave_time += launch_and_time_kernel(stream_config, - kernel, - dim3(gdx, gdy, gdz), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_k0_m_k1_container_[i], - arg.b_grid_desc_k0_n_k1_container_[i], - arg.c_grid_desc_m_n_container_[i]); + if(stream_config.flush_cache) + { + // Clear input only for perf measurement. 
+ // For non-grouped solver user has to clear input on his own. + const auto clear_input = [&]() { + if(i == 0) + { + hip_check_error(hipMemsetAsync(arg.p_c_grid_, + 0, + arg.c_space_size_bytes, + stream_config.stream_id_)); + } + }; + ave_time += launch_and_time_kernel_with_preprocess_flush_cache( + stream_config, + clear_input, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i]); + } + else + { + ave_time += launch_and_time_kernel(stream_config, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i]); + } } } return ave_time; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index 8b71a4fa40..825a3f8b5c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -1225,26 +1225,50 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 has_main_loop, no_main_loop, CTranspose>; - - return launch_and_time_kernel_with_preprocess( - stream_config, - clear_workspace, - kernel, - dim3(gdx, gdy, gdz), - dim3(BlockSize), - 0, - p_b_grid, - p_a_grid, - arg.p_ds_grid_, - p_e_grid, - gemm_kernel_args, - gemms_count_for_set, - arg.b_element_op_, - arg.a_element_op_, - arg.cde_element_op_, - arg.compute_ptr_offset_of_batch_, - arg.compute_ptr_offset_of_n_, - arg.k_batch_); + if(stream_config.flush_cache) + { + return launch_and_time_kernel_with_preprocess_flush_cache( + stream_config, + 
clear_workspace, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + p_b_grid, + p_a_grid, + arg.p_ds_grid_, + p_e_grid, + gemm_kernel_args, + gemms_count_for_set, + arg.b_element_op_, + arg.a_element_op_, + arg.cde_element_op_, + arg.compute_ptr_offset_of_batch_, + arg.compute_ptr_offset_of_n_, + arg.k_batch_); + } + else + { + return launch_and_time_kernel_with_preprocess( + stream_config, + clear_workspace, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + p_b_grid, + p_a_grid, + arg.p_ds_grid_, + p_e_grid, + gemm_kernel_args, + gemms_count_for_set, + arg.b_element_op_, + arg.a_element_op_, + arg.cde_element_op_, + arg.compute_ptr_offset_of_batch_, + arg.compute_ptr_offset_of_n_, + arg.k_batch_); + } } else { @@ -1264,26 +1288,50 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 has_main_loop, no_main_loop, CTranspose>; - - return launch_and_time_kernel_with_preprocess( - stream_config, - clear_workspace, - kernel, - dim3(gdx, gdy, gdz), - dim3(BlockSize), - 0, - p_a_grid, - p_b_grid, - arg.p_ds_grid_, - p_e_grid, - gemm_kernel_args, - gemms_count_for_set, - arg.a_element_op_, - arg.b_element_op_, - arg.cde_element_op_, - arg.compute_ptr_offset_of_batch_, - arg.compute_ptr_offset_of_n_, - arg.k_batch_); + if(stream_config.flush_cache) + { + return launch_and_time_kernel_with_preprocess_flush_cache( + stream_config, + clear_workspace, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + arg.p_ds_grid_, + p_e_grid, + gemm_kernel_args, + gemms_count_for_set, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.compute_ptr_offset_of_batch_, + arg.compute_ptr_offset_of_n_, + arg.k_batch_); + } + else + { + return launch_and_time_kernel_with_preprocess( + stream_config, + clear_workspace, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + arg.p_ds_grid_, + p_e_grid, + gemm_kernel_args, + gemms_count_for_set, + arg.a_element_op_, + arg.b_element_op_, + 
arg.cde_element_op_, + arg.compute_ptr_offset_of_batch_, + arg.compute_ptr_offset_of_n_, + arg.k_batch_); + } } }; if(has_loop_in_all_gemm) diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp index 62d7971a8a..0775b34eef 100644 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -88,6 +88,7 @@ enum struct amdgcn_target_id GFX1030 = 0x1030, GFX1031 = 0x1031, GFX1032 = 0x1032, + GFX1033 = 0x1033, GFX1034 = 0x1034, GFX1035 = 0x1035, GFX1036 = 0x1036, @@ -284,6 +285,7 @@ constexpr auto get_compiler_target() MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1030, GFX1030); MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1031, GFX1031); MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1032, GFX1032); + MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1033, GFX1033); MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1034, GFX1034); MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1035, GFX1035); MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1036, GFX1036); @@ -351,6 +353,7 @@ CK_TILE_HOST auto hip_device_prop_gcn_arch_name_to_amdgcn_target_id(char const* MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1030", GFX1030); MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1031", GFX1031); MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1032", GFX1032); + MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1033", GFX1033); MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1034", GFX1034); MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1035", GFX1035); MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1036", GFX1036); @@ -607,6 +610,7 @@ CK_TILE_HOST_DEVICE constexpr auto get_compiler_target() MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1030, GFX1030); MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1031, GFX1031); MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1032, GFX1032); + 
MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1033, GFX1033); MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1034, GFX1034); MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1035, GFX1035); MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1036, GFX1036); @@ -688,6 +692,7 @@ CK_TILE_HOST auto hip_device_prop_gcn_arch_name_to_amdgcn_target(char const* tes MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx1030", GFX1030); MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx1031", GFX1031); MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx1032", GFX1032); + MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx1033", GFX1033); MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx1034", GFX1034); MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx1035", GFX1035); MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx1036", GFX1036); diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index a057ae9052..036e241c95 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -15,8 +15,8 @@ #define __gfx101__ #endif #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \ - defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \ - defined(__gfx10_3_generic__) + defined(__gfx1033__) || defined(__gfx1034__) || defined(__gfx1035__) || \ + defined(__gfx1036__) || defined(__gfx10_3_generic__) #define __gfx103__ #endif #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \ @@ -405,6 +405,12 @@ struct amdgcn_compiler_target_state static constexpr bool CK_TILE_ARCH_GFX1032 = false; #endif // __gfx1032__ +#if defined(__gfx1033__) + static constexpr bool CK_TILE_ARCH_GFX1033 = true; +#else + static constexpr bool CK_TILE_ARCH_GFX1033 = false; +#endif // __gfx1033__ + #if defined(__gfx1034__) static constexpr bool CK_TILE_ARCH_GFX1034 = true; #else @@ -537,6 +543,7 @@ 
CK_TILE_HOST_DEVICE static constexpr uint32_t count_values_of(T search, Ts... se amdgcn_compiler_target_state::CK_TILE_ARCH_GFX1030, \ amdgcn_compiler_target_state::CK_TILE_ARCH_GFX1031, \ amdgcn_compiler_target_state::CK_TILE_ARCH_GFX1032, \ + amdgcn_compiler_target_state::CK_TILE_ARCH_GFX1033, \ amdgcn_compiler_target_state::CK_TILE_ARCH_GFX1034, \ amdgcn_compiler_target_state::CK_TILE_ARCH_GFX1035, \ amdgcn_compiler_target_state::CK_TILE_ARCH_GFX1036, \ diff --git a/include/ck_tile/core/container/container_helper.hpp b/include/ck_tile/core/container/container_helper.hpp index 90579c0034..699f0c8a65 100644 --- a/include/ck_tile/core/container/container_helper.hpp +++ b/include/ck_tile/core/container/container_helper.hpp @@ -39,7 +39,7 @@ CK_TILE_HOST_DEVICE constexpr auto container_reorder_given_new2old(const array& old_array, sequence /*new2old*/) { static_assert(NSize == sizeof...(IRs), "wrong! size not consistent"); - static_assert(is_valid_sequence_map>{}, "wrong! invalid reorder map"); + static_assert(is_valid_sequence_map>::value, "wrong! invalid reorder map"); return make_array>(old_array[IRs]...); } @@ -89,7 +89,7 @@ CK_TILE_HOST_DEVICE constexpr auto container_reorder_given_new2old(const tuple>{}, "wrong! invalid reorder map"); + static_assert(is_valid_sequence_map>::value, "wrong! invalid reorder map"); return make_tuple(old_tuple[number{}]...); } @@ -109,7 +109,7 @@ CK_TILE_HOST_DEVICE constexpr auto container_reorder_given_new2old(sequence>{}, "wrong! invalid reorder map"); + static_assert(is_valid_sequence_map>::value, "wrong! invalid reorder map"); return sequence::at(number{})...>{}; } @@ -120,7 +120,7 @@ CK_TILE_HOST_DEVICE constexpr auto container_reorder_given_old2new(sequence>{}, "wrong! invalid reorder map"); + static_assert(is_valid_sequence_map>::value, "wrong! 
invalid reorder map"); constexpr auto new2old = typename sequence_map_inverse>::type{}; diff --git a/include/ck_tile/core/container/sequence.hpp b/include/ck_tile/core/container/sequence.hpp index 35858bf75e..73ce09b20e 100644 --- a/include/ck_tile/core/container/sequence.hpp +++ b/include/ck_tile/core/container/sequence.hpp @@ -144,9 +144,11 @@ struct sequence static_assert(MapOld2New::size() == size(), "wrong! reorder map should have the same size as sequence to be rerodered"); - static_assert(is_valid_sequence_map::value, "wrong! invalid reorder map"); + static_assert(is_valid_sequence_map>::value, + "wrong! invalid reorder map"); - return reorder_new_to_old(typename sequence_map_inverse::type{}); + return reorder_new_to_old( + typename sequence_map_inverse>::type{}); } CK_TILE_HOST_DEVICE static constexpr auto reverse() @@ -548,163 +550,59 @@ struct sequence_reduce }; #endif -template -struct sequence_sort_impl +// Sorts a sequence using constexpr insertion sort. O(1) template instantiation +// depth, replacing the recursive merge sort that created O(N log N) intermediate types. +namespace detail { + +template +struct sequence_sort_helper; + +template +struct sequence_sort_helper, Compare, sequence> { - template - struct sorted_sequence_merge_impl + struct sort_result { - static constexpr bool choose_left = LeftValues::front() < RightValues::front(); - - static constexpr index_t chosen_value = - choose_left ? LeftValues::front() : RightValues::front(); - static constexpr index_t chosen_id = choose_left ? 
LeftIds::front() : RightIds::front(); - - using new_merged_values = decltype(MergedValues::push_back(number{})); - using new_merged_ids = decltype(MergedIds::push_back(number{})); - - using new_left_values = typename std:: - conditional::type; - using new_left_ids = - typename std::conditional::type; - - using new_right_values = typename std:: - conditional::type; - using new_right_ids = - typename std::conditional::type; - - using merge = sorted_sequence_merge_impl; - // this is output - using merged_values = typename merge::merged_values; - using merged_ids = typename merge::merged_ids; + static_array values; + static_array ids; }; - template - struct sorted_sequence_merge_impl, - sequence<>, - MergedValues, - MergedIds, - Comp> + static constexpr sort_result compute() { - using merged_values = typename sequence_merge::type; - using merged_ids = typename sequence_merge::type; - }; + constexpr index_t n = sizeof...(Vs); + sort_result r{{{Vs...}}, {{Idx...}}}; + // insertion sort — O(N^2) constexpr steps, O(1) template depth + for(index_t i = 1; i < n; ++i) + { + for(index_t j = i; j > 0 && Compare{}(r.values[j], r.values[j - 1]); --j) + { + auto tv = r.values[j]; + r.values[j] = r.values[j - 1]; + r.values[j - 1] = tv; + auto ti = r.ids[j]; + r.ids[j] = r.ids[j - 1]; + r.ids[j - 1] = ti; + } + } + return r; + } - template - struct sorted_sequence_merge_impl, - sequence<>, - RightValues, - RightIds, - MergedValues, - MergedIds, - Comp> - { - using merged_values = typename sequence_merge::type; - using merged_ids = typename sequence_merge::type; - }; - - template - struct sorted_sequence_merge - { - using merge = sorted_sequence_merge_impl, - sequence<>, - Comp>; - - using merged_values = typename merge::merged_values; - using merged_ids = typename merge::merged_ids; - }; - - static constexpr index_t nsize = Values::size(); - - using split_unsorted_values = sequence_split; - using split_unsorted_ids = sequence_split; - - using left_unsorted_values = typename 
split_unsorted_values::left_type; - using left_unsorted_ids = typename split_unsorted_ids::left_type; - using left_sort = sequence_sort_impl; - using left_sorted_values = typename left_sort::sorted_values; - using left_sorted_ids = typename left_sort::sorted_ids; - - using right_unsorted_values = typename split_unsorted_values::right_type; - using right_unsorted_ids = typename split_unsorted_ids::right_type; - using right_sort = sequence_sort_impl; - using right_sorted_values = typename right_sort::sorted_values; - using right_sorted_ids = typename right_sort::sorted_ids; - - using merged_sorted = sorted_sequence_merge; - - using sorted_values = typename merged_sorted::merged_values; - using sorted_ids = typename merged_sorted::merged_ids; + static constexpr sort_result sorted = compute(); + using sorted_values = sequence; + using sorted_ids = sequence; }; -template -struct sequence_sort_impl, sequence, Compare> -{ - static constexpr bool choose_x = Compare{}(ValueX, ValueY); - - using sorted_values = typename std:: - conditional, sequence>::type; - using sorted_ids = - typename std::conditional, sequence>::type; -}; - -template -struct sequence_sort_impl, sequence, Compare> -{ - using sorted_values = sequence; - using sorted_ids = sequence; -}; - -template -struct sequence_sort_impl, sequence<>, Compare> -{ - using sorted_values = sequence<>; - using sorted_ids = sequence<>; -}; +} // namespace detail template struct sequence_sort { - using unsorted_ids = typename arithmetic_sequence_gen<0, Values::size(), 1>::type; - using sort = sequence_sort_impl; + static constexpr index_t n = Values::size(); + using idx_seq = make_index_sequence; - // this is output - using type = typename sort::sorted_values; - using sorted2unsorted_map = typename sort::sorted_ids; + using helper = detail::sequence_sort_helper, Compare, idx_seq>; + + using type = typename helper::sorted_values; + using sorted2unsorted_map = typename helper::sorted_ids; }; template @@ -782,10 +680,42 @@ 
struct sequence_unique_sort using sorted2unsorted_map = typename uniquify::uniquified_ids; }; +// Validates that a sequence is a permutation of {0, 1, ..., N-1}. +// Uses a constexpr loop instead of instantiating sequence_sort. +namespace detail { + +template +constexpr bool check_valid_sequence_map() +{ + constexpr index_t n = sizeof...(Is); + if constexpr(n == 0) + { + return true; + } + else + { + constexpr index_t vals[] = {Is...}; + static_array seen{}; + for(index_t i = 0; i < n; ++i) + { + if(vals[i] < 0 || vals[i] >= n || seen[vals[i]]) + return false; + seen[vals[i]] = true; + } + return true; + } +} + +} // namespace detail + template -struct is_valid_sequence_map - : std::is_same::type, - typename sequence_sort>::type> +struct is_valid_sequence_map : std::false_type +{ +}; + +template +struct is_valid_sequence_map> + : std::integral_constant()> { }; diff --git a/include/ck_tile/core/tensor/tensor_adaptor.hpp b/include/ck_tile/core/tensor/tensor_adaptor.hpp index e6cdb66ef9..56c62a29ee 100644 --- a/include/ck_tile/core/tensor/tensor_adaptor.hpp +++ b/include/ck_tile/core/tensor/tensor_adaptor.hpp @@ -376,9 +376,10 @@ CK_TILE_HOST_DEVICE constexpr auto make_single_stage_tensor_adaptor(const Transf constexpr auto all_up_dim_new_top_ids = unpack( [](auto&&... xs) constexpr { return merge_sequences(xs...); }, UpperDimensionNewTopIdss{}); - static_assert(is_valid_sequence_map::value && - is_valid_sequence_map::value, - "wrong!"); + static_assert( + is_valid_sequence_map>::value && + is_valid_sequence_map>::value, + "wrong!"); constexpr index_t ndim_old_top = all_low_dim_old_top_ids.size(); constexpr index_t ndim_new_top = all_up_dim_new_top_ids.size(); @@ -443,8 +444,8 @@ transform_tensor_adaptor(const OldTensorAdaptor& old_tensor_adaptor, constexpr auto all_new_top_ids = unpack([](auto... 
xs) { return merge_sequences(xs...); }, NewUpperDimensionNewTopIdss{}); - static_assert(is_valid_sequence_map::value && - is_valid_sequence_map::value, + static_assert(is_valid_sequence_map>::value && + is_valid_sequence_map>::value, "wrong!"); } diff --git a/include/ck_tile/core/utility/functional.hpp b/include/ck_tile/core/utility/functional.hpp index ae79d575a8..032be236b6 100644 --- a/include/ck_tile/core/utility/functional.hpp +++ b/include/ck_tile/core/utility/functional.hpp @@ -135,65 +135,147 @@ struct idx_identity namespace detail { -// RemainLengths: sequence<...> -// Orders: sequence<...> -template -struct static_ford_impl +// Computes the inverse of a permutation as a constexpr array. +// Avoids the sequence_map_inverse -> is_valid_sequence_map -> sequence_sort chain. +template +struct inverse_perm; + +template +struct inverse_perm> { - CK_TILE_HOST_DEVICE constexpr static_ford_impl() + static constexpr auto compute() { - static_assert(RemainLengths::size() > 0, "wrong! should not get here"); + constexpr index_t n = sizeof...(Ps); + static_array result{}; + constexpr index_t input[] = {Ps...}; + for(index_t i = 0; i < n; ++i) + { + result[input[i]] = i; + } + return result; + } + static constexpr auto value = compute(); +}; + +// Decomposes a linear index into multi-dimensional indices using pre-computed +// strides. Uses a single flat static_for instead of recursive nesting, which +// eliminates intermediate lambda closure instantiations. 
+template +struct index_decomposer; + +template +struct index_decomposer, sequence> +{ + static constexpr index_t n_dim = sizeof...(Ls); + static constexpr static_array lengths = {{Ls...}}; + + static constexpr static_array compute_all_strides() + { + static_array result{}; + if constexpr(n_dim > 0) + { + result[n_dim - 1] = 1; + for(index_t i = n_dim - 1; i > 0; --i) + { + result[i - 1] = result[i] * lengths[i]; + } + } + return result; } - // F signature: F(sequence<...>) - // CurrentOrderedId: sequence<...> - template - CK_TILE_HOST_DEVICE constexpr void operator()(F f, CurrentOrderedId) const + static constexpr static_array strides = compute_all_strides(); + + // Compile-time decomposition: linear index -> sequence of per-dimension indices + template + using decompose = sequence<((LinearIdx / strides[Is]) % lengths[Is])...>; + + // Decompose AND reorder in one step using a pre-computed inverse permutation. + // Produces the unordered multi-index directly, avoiding per-iteration + // reorder_old_to_new member function instantiations on each unique sequence type. + template + using decompose_reordered = sequence<((LinearIdx / strides[inverse_perm::value[Is]]) % + lengths[inverse_perm::value[Is]])...>; +}; + +// Calls f(decompose{}) for each linear index I in the pack, using a single +// fold expression. Bypasses the static_for lambda entirely, eliminating M*N +// intermediate lambda closure instantiations that the lambda-based approach creates. +template +struct ford_applier; + +template +struct ford_applier> +{ + template + CK_TILE_HOST_DEVICE constexpr void operator()(F f) const { - static_for<0, RemainLengths::front(), 1>{}([=](auto I) { - static_ford_impl{}( - f, CurrentOrderedId::push_back(I)); - }); + if constexpr(sizeof...(LinearIds) > 0) + { + (f(typename Decomposer::template decompose{}), ...); + } } }; -template -struct static_ford_impl, Orders> +// Same as ford_applier but applies reordering during decomposition. 
+template +struct ford_applier_reordered; + +template +struct ford_applier_reordered> { - // F signature: F(sequence<...>) - // OrderedId: sequence<...> - template - CK_TILE_HOST_DEVICE constexpr void operator()(F f, OrderedId) const + template + CK_TILE_HOST_DEVICE constexpr void operator()(F f) const { - // retrive unordered Id - f(OrderedId::reorder_old_to_new(Orders{})); + if constexpr(sizeof...(LinearIds) > 0) + { + (f(typename Decomposer::template decompose_reordered{}), ...); + } } }; } // namespace detail -// Lengths is sequence<...>, it is the length of each dimension for -// N-dimensional loop -// Orders is sequence<...>, it is the order of dimension in which static_ford -// will loop over each -// dimension +// Compile-time N-dimensional loop with static multi-indices. +// Uses direct fold expansion with index decomposition, producing zero +// intermediate lambda closures. Each iteration calls f with a compile-time +// sequence containing the multi-dimensional index. template ::type> struct static_ford { + static constexpr index_t n_dim = Lengths::size(); + static constexpr index_t total_size = + reduce_on_sequence(Lengths{}, multiplies<>{}, number<1>{}); + + static constexpr bool is_identity_order = std::is_same_v>; + + // For identity order, OrderedLengths == Lengths (no reorder needed). + // For non-identity, reorder lengths according to iteration order. + // Both branches must be valid types, but only the active one is used. + using OrderedLengths = + std::conditional_t>; + using Decomposer = detail::index_decomposer>; + CK_TILE_HOST_DEVICE constexpr static_ford() { static_assert(Lengths::size() > 0, "wrong! Lengths is empty"); static_assert(Lengths::size() == Orders::size(), "wrong! 
inconsistent size"); } - // F signature: F(sequence<...> multi_id) - // multi_id is the unordered multi-index template CK_TILE_HOST_DEVICE constexpr void operator()(F f) const { - constexpr auto ordered_lengths = Lengths::reorder_new_to_old(Orders{}); - detail::static_ford_impl{}(f, sequence<>{}); + if constexpr(is_identity_order) + { + detail::ford_applier>{}(f); + } + else + { + detail::ford_applier_reordered>{}( + f); + } } }; diff --git a/include/ck_tile/core/utility/reduce_operator.hpp b/include/ck_tile/core/utility/reduce_operator.hpp index c73f76dd39..d02e327457 100644 --- a/include/ck_tile/core/utility/reduce_operator.hpp +++ b/include/ck_tile/core/utility/reduce_operator.hpp @@ -103,6 +103,42 @@ struct Max } }; +struct Min +{ + template < + typename T, + typename = std::enable_if_t< + is_any_of::value>> + CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() + { + return numeric::max(); + }; + + template < + typename T, + typename = std::enable_if_t< + is_any_of::value>> + CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const + { + return min(y, x); + } + + // Overload with changed flag for index tracking + template < + typename T, + typename = std::enable_if_t< + is_any_of::value>> + CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x, bool& changed) const + { + T new_min = min(y, x); + if(x < y) + { + changed = true; + } + return new_min; + } +}; + struct AbsMax { template < diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index 5d1ac2fd2f..fba831e205 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -129,7 +129,13 @@ struct CShuffleEpilogue static constexpr index_t isCTransposed = Problem::isCTransposed; static constexpr bool FixedVectorSize = Problem::FixedVectorSize; static constexpr bool TiledMMAPermuteN = Problem::TiledMMAPermuteN; - static constexpr bool EightWave = (MWave * 
NWave == 8); + +#if defined(__gfx9__) + static constexpr bool EightWave = (MWave * NWave == 8); +#else + static constexpr bool EightWave = false; +#endif + static constexpr index_t BlockedXDLN_PerWarp = EightWave ? kNPerBlock / NWave / NPerXdl : Problem::BlockedXDLN_PerWarp; static constexpr bool DoubleSmemBuffer = Problem::DoubleSmemBuffer; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp index 0a11102992..d7ed58f58a 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp @@ -1516,6 +1516,7 @@ struct FmhaBwdOGradDotOKernel using DDataType = ck_tile::remove_cvref_t; using ODataType = ck_tile::remove_cvref_t; using OGradDataType = ck_tile::remove_cvref_t; + using LSEDataType = ck_tile::remove_cvref_t; static constexpr bool kIsGroupMode = FmhaBwdOGradDotO::kIsGroupMode; static constexpr bool kPadSeqLenQ = FmhaBwdOGradDotO::kPadSeqLenQ; @@ -1557,25 +1558,31 @@ struct FmhaBwdOGradDotOKernel const void* o_ptr; const void* do_ptr; void* d_ptr; + const void* lse_ptr; // log-sum-exp from forward pass, shape [batch, nhead, seqlen_q] + const LSEDataType* sink_ptr; // sink scores, shape [batch, nhead]; nullptr disables sink + LSEDataType* d_sink_ptr; // sink gradient output, shape [nhead]; nullptr disables sink grad float p_undrop; ck_tile::index_t seqlen_q; ck_tile::index_t hdim_v; + ck_tile::index_t nhead; // used to index sink_ptr / d_sink_ptr ck_tile::index_t stride_do; ck_tile::index_t stride_o; ck_tile::index_t nhead_stride_do; ck_tile::index_t nhead_stride_o; - ck_tile::index_t nhead_stride_d; + // LSE and D always share the same layout; this stride covers both. 
+ ck_tile::index_t nhead_stride_lsed; }; struct FmhaBwdOGradDotOBatchModeKargs : FmhaBwdOGradDotOCommonKargs { ck_tile::index_t batch_stride_do; ck_tile::index_t batch_stride_o; - ck_tile::index_t batch_stride_d; + // LSE and D always share the same layout; this stride covers both. + ck_tile::index_t batch_stride_lsed; }; struct FmhaBwdOGradDotOGroupModeKargs : FmhaBwdOGradDotOCommonKargs @@ -1593,32 +1600,40 @@ struct FmhaBwdOGradDotOKernel MakeKargs(const void* o_ptr, const void* do_ptr, void* d_ptr, + const void* lse_ptr, + const void* sink_ptr, + void* d_sink_ptr, float p_undrop, ck_tile::index_t seqlen_q, ck_tile::index_t hdim_v, + ck_tile::index_t nhead, ck_tile::index_t stride_do, ck_tile::index_t stride_o, ck_tile::index_t nhead_stride_do, ck_tile::index_t nhead_stride_o, - ck_tile::index_t nhead_stride_d, + ck_tile::index_t nhead_stride_lsed, ck_tile::index_t batch_stride_do, ck_tile::index_t batch_stride_o, - ck_tile::index_t batch_stride_d) + ck_tile::index_t batch_stride_lsed) { Kargs kargs{{o_ptr, do_ptr, d_ptr, + lse_ptr, + reinterpret_cast(sink_ptr), + reinterpret_cast(d_sink_ptr), p_undrop, seqlen_q, hdim_v, + nhead, stride_do, stride_o, nhead_stride_do, nhead_stride_o, - nhead_stride_d}, + nhead_stride_lsed}, batch_stride_do, batch_stride_o, - batch_stride_d}; + batch_stride_lsed}; return kargs; } @@ -1628,28 +1643,36 @@ struct FmhaBwdOGradDotOKernel MakeKargs(const void* o_ptr, const void* do_ptr, void* d_ptr, + const void* lse_ptr, + const void* sink_ptr, + void* d_sink_ptr, float p_undrop, const void* seqstart_q_ptr, const void* seqlen_q_ptr, const void* cu_seqlen_q_ptr, ck_tile::index_t hdim_v, + ck_tile::index_t nhead, ck_tile::index_t stride_do, ck_tile::index_t stride_o, ck_tile::index_t nhead_stride_do, ck_tile::index_t nhead_stride_o, - ck_tile::index_t nhead_stride_d) + ck_tile::index_t nhead_stride_lsed) { Kargs kargs{{o_ptr, do_ptr, d_ptr, + lse_ptr, + reinterpret_cast(sink_ptr), + reinterpret_cast(d_sink_ptr), p_undrop, -1, // seqlen 
will be updated by another pointer hdim_v, + nhead, stride_do, stride_o, nhead_stride_do, nhead_stride_o, - nhead_stride_d}, + nhead_stride_lsed}, reinterpret_cast(seqstart_q_ptr), reinterpret_cast(seqlen_q_ptr), reinterpret_cast(cu_seqlen_q_ptr)}; @@ -1683,18 +1706,18 @@ struct FmhaBwdOGradDotOKernel const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0); - long_index_t batch_offset_o = 0; - long_index_t batch_offset_do = 0; - long_index_t batch_offset_d = 0; + long_index_t batch_offset_o = 0; + long_index_t batch_offset_do = 0; + long_index_t batch_offset_lsed = 0; if constexpr(kIsGroupMode) { // get starting offset for each batch const long_index_t query_start = kargs.seqstart_q_ptr[i_batch]; - batch_offset_o = query_start * kargs.stride_o; - batch_offset_do = query_start * kargs.stride_do; - batch_offset_d = query_start; + batch_offset_o = query_start * kargs.stride_o; + batch_offset_do = query_start * kargs.stride_do; + batch_offset_lsed = query_start; // Priority: cu_seqlen_q_ptr > seqlen_q_ptr > physical_seqlen_q if(kargs.cu_seqlen_q_ptr != nullptr) @@ -1722,11 +1745,20 @@ struct FmhaBwdOGradDotOKernel } else { - batch_offset_o = static_cast(i_batch) * kargs.batch_stride_o; - batch_offset_do = static_cast(i_batch) * kargs.batch_stride_do; - batch_offset_d = static_cast(i_batch) * kargs.batch_stride_d; + batch_offset_o = static_cast(i_batch) * kargs.batch_stride_o; + batch_offset_do = static_cast(i_batch) * kargs.batch_stride_do; + batch_offset_lsed = static_cast(i_batch) * kargs.batch_stride_lsed; } + // Read per-head sink score and convert to log2 domain so the pipeline can use exp2. + // Pre-multiply by log2e so that exp2(sink_value - log2e*lse) == exp(raw_sink - lse). + // -inf is left unchanged (log2e * -inf == -inf) to keep P_sink -> 0 when sink is disabled. + const LSEDataType sink_value = + kargs.sink_ptr != nullptr + ? 
log2e_v * + kargs.sink_ptr[static_cast(i_batch) * kargs.nhead + i_nhead] + : -numeric::infinity(); + // for simplicity, batch stride we just modify the pointer const ODataType* o_ptr = reinterpret_cast(kargs.o_ptr) + static_cast(i_nhead) * kargs.nhead_stride_o + @@ -1734,9 +1766,13 @@ struct FmhaBwdOGradDotOKernel const OGradDataType* do_ptr = reinterpret_cast(kargs.do_ptr) + static_cast(i_nhead) * kargs.nhead_stride_do + batch_offset_do; + const LSEDataType* lse_ptr = reinterpret_cast(kargs.lse_ptr) + + static_cast(i_nhead) * kargs.nhead_stride_lsed + + batch_offset_lsed; + DDataType* d_ptr = reinterpret_cast(kargs.d_ptr) + - static_cast(i_nhead) * kargs.nhead_stride_d + - batch_offset_d; + static_cast(i_nhead) * kargs.nhead_stride_lsed + + batch_offset_lsed; // O/dO/D DRAM and DRAM window const auto o_dram = [&]() { @@ -1770,13 +1806,31 @@ struct FmhaBwdOGradDotOKernel auto o_dram_window = make_tile_window(o_dram, make_tuple(number{}, number{}), {i_m0, 0}); - auto do_dram_window = make_tile_window(do_dram, make_tuple(number{}, number{}), {i_m0, 0}); - auto d_dram_window = make_tile_window(d_dram, make_tuple(number{}), {i_m0}); - FmhaBwdOGradDotO{}(o_dram_window, do_dram_window, d_dram_window, kargs.p_undrop); + // nullptr when sink grad is disabled; the pipeline checks this to skip the sink path + LSEDataType* atomic_sink_grad_ptr = + kargs.d_sink_ptr == nullptr ? nullptr : kargs.d_sink_ptr + i_nhead; + + // lse_ptr is always valid (also needed by the main bwd kernel). + // The actual load happens inside the pipeline only when atomic_sink_grad_ptr != nullptr. 
+ auto lse_dram = [&]() { + const auto lse_dram_naive = make_naive_tensor_view_packed( + lse_ptr, make_tuple(kargs.seqlen_q), number<1>{}); + return pad_tensor_view( + lse_dram_naive, make_tuple(number{}), sequence{}); + }(); + auto lse_dram_window = make_tile_window(lse_dram, make_tuple(number{}), {i_m0}); + + FmhaBwdOGradDotO{}(o_dram_window, + do_dram_window, + lse_dram_window, + d_dram_window, + sink_value, + kargs.p_undrop, + atomic_sink_grad_ptr); } }; diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dot_do_o.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dot_do_o.hpp index f01d681002..1cc40fdaa9 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dot_do_o.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dot_do_o.hpp @@ -14,6 +14,7 @@ struct BlockFmhaBwdOGradDotO using ODataType = remove_cvref_t; using OGradDataType = remove_cvref_t; using DDataType = remove_cvref_t; + using LSEDataType = remove_cvref_t; // needed for sink gradient static constexpr index_t kBlockPerCu = Problem::kBlockPerCu; static constexpr index_t kBlockSize = Problem::kBlockSize; @@ -32,11 +33,18 @@ struct BlockFmhaBwdOGradDotO template + // Computes D = diag(dO * O) and optionally accumulates the sink token gradient. + // sink_value: log-space sink score; pass -inf and atomic_sink_grad_ptr=nullptr to skip sink. + // atomic_sink_grad_ptr: per-head accumulator in global memory; nullptr disables sink path. 
CK_TILE_HOST_DEVICE void operator()(const ODramBlockWindowTmp& o_dram_block_window_tmp, const OGradDramBlockWindowTmp& do_dram_block_window_tmp, + const LSEDramBlockWindowTmp& lse_dram_block_window_tmp, DDramBlockWindowTmp& d_dram_block_window_tmp, - float p_undrop) const + const LSEDataType sink_value, + float p_undrop, + LSEDataType* atomic_sink_grad_ptr = nullptr) const { static_assert( std::is_same_v> && @@ -44,6 +52,10 @@ struct BlockFmhaBwdOGradDotO remove_cvref_t> && std::is_same_v>, "wrong!"); + // atomic_sink_grad_ptr is reinterpret_cast to float* in the sink path; + // ensure LSEDataType is float so the cast is well-defined. + static_assert(std::is_same_v, + "sink gradient atomicAdd requires LSEDataType == float"); static_assert(kBlockSize == ODramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && kBlockSize == @@ -67,14 +79,13 @@ struct BlockFmhaBwdOGradDotO auto do_ = load_tile(do_dram_window); - // declare d + // D[q] = sum_j(O[q,j] * dO[q,j]), used in softmax backward constexpr auto d_dstr = make_static_tile_distribution(detail::make_reduce_tile_distribution_encoding( o.get_tile_distribution().get_static_tile_distribution_encoding(), sequence<1>{})); auto d = make_static_distributed_tensor(d_dstr); - - clear_tile(d); // Initialize D + clear_tile(d); constexpr auto o_spans = decltype(o)::get_distributed_spans(); sweep_tile_span(o_spans[number<0>{}], [&](auto idx0) { @@ -86,9 +97,67 @@ struct BlockFmhaBwdOGradDotO }); }); + // Scale by p_undrop (=1 when dropout is disabled) tile_elementwise_inout([&p_undrop](auto& x) { x = x * p_undrop; }, d); store_tile(d_dram_block_window_tmp, d); + + // Sink gradient path: skipped entirely when atomic_sink_grad_ptr is nullptr + if(atomic_sink_grad_ptr != nullptr) + { + // Load LSE only on the sink path to avoid unnecessary global memory reads + constexpr auto lse_dstr = + make_static_tile_distribution(detail::make_reduce_tile_distribution_encoding( + 
o.get_tile_distribution().get_static_tile_distribution_encoding(), + sequence<1>{})); + auto lse_dram_window = + make_tile_window(lse_dram_block_window_tmp.get_bottom_tensor_view(), + lse_dram_block_window_tmp.get_window_lengths(), + lse_dram_block_window_tmp.get_window_origin(), + lse_dstr); + auto lse_ = load_tile(lse_dram_window); + + // Compute per-query contribution: -P_sink[q] * D[q] + // where P_sink[q] = exp2(sink_value - log2e*lse[q]) + // sink_value has already been pre-multiplied by log2e at the kernel call site, + // so exp2(sink_value - log2e*lse) == exp(raw_sink - lse). + // exp2 maps directly to the v_exp_f32 hardware instruction on AMD GPUs. + // Always accumulate in float regardless of DDataType to avoid precision loss + // and to ensure atomicAdd works correctly on all architectures. + auto sink_val_tensor = make_static_distributed_tensor(d_dstr); + tile_elementwise_inout( + [&](auto& s_out, const auto& l_in, const auto& d_in) { + float p_sink = exp2(type_convert(sink_value) - + log2e_v * type_convert(l_in)); + s_out = -p_sink * type_convert(d_in); + }, + sink_val_tensor, + lse_, + d); + + // Reduce contributions held by this thread + float thread_sum = 0.f; + constexpr auto s_spans = decltype(sink_val_tensor)::get_distributed_spans(); + sweep_tile_span(s_spans[number<0>{}], [&](auto idx0) { + constexpr auto i_idx = make_tuple(idx0); + thread_sum += sink_val_tensor(i_idx); + }); + + // Warp-level reduction: fold thread_sum across lanes so only one + // atomicAdd per warp is issued instead of one per thread. +#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__) + const index_t warp_sz = get_warp_size(); + for(index_t offset = warp_sz >> 1; offset > 0; offset >>= 1) + thread_sum += warp_shuffle_down(thread_sum, offset); + + // Only lane 0 of each warp writes to global memory. 
+ // Note: this atomicAdd is non-deterministic across runs regardless of the + // -deterministic flag, because d_sink is a single scalar per head accumulated + // across all thread-blocks. The practical impact is negligible for this value. + if(get_lane_id() == 0) + atomicAdd(reinterpret_cast(atomic_sink_grad_ptr), thread_sum); +#endif + } } }; diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp index 97db0f95c4..34ba8d6c47 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp @@ -163,7 +163,9 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP amd_wave_read_first_lane(integer_divide_ceil(seqlen_q_end - seqlen_q_start, kM0)); // check early exit if no work to do. - if(num_total_loop <= 0) + // __builtin_expect is load-bearing: omitting it causes incorrect AGPR allocation in + // the dK/dV accumulation loop on some compiler versions, leading to wrong results. 
+ if(__builtin_expect(num_total_loop <= 0, 0)) { // Note: here dk_acc&dv_acc are all cleared, return it return make_tuple(dk_acc, dv_acc); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp index 5ac67081dc..f553945a37 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp @@ -67,6 +67,7 @@ struct BlockFmhaBwdPipelineProblem template ; using OGradDataType = remove_cvref_t; using DDataType = remove_cvref_t; + using LSEDataType = remove_cvref_t; using Traits = remove_cvref_t; static_assert(0 < kBlockSize_ && kBlockSize_ % get_warp_size() == 0, diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp index f5166cfdcb..d5ba324326 100644 --- a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp @@ -1226,23 +1226,37 @@ struct UniversalGemmKernel s_waitcnt_barrier(); const auto tile_idx = amd_wave_read_first_lane(block_id % num_tiles); const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(tile_idx); - const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); - const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); + // Apply pivot to M tile index first, then use the same pivoted index + // for both data-tile selection and chunk-signal wait. 
+ auto iM_eff = amd_wave_read_first_lane(iM); + + if(kargs.async_input_scheduler.chunk_signals != nullptr) + { + const auto tile_idx_pivot = + amd_wave_read_first_lane(kargs.async_input_scheduler.tile_idx_pivot_m); + const auto tiles_m = amd_wave_read_first_lane( + integer_divide_ceil(kargs.M, TilePartitioner::MPerBlock)); + if(tiles_m > 0) + { + iM_eff = amd_wave_read_first_lane((iM_eff + tile_idx_pivot) % tiles_m); + } + } + + const index_t i_m = amd_wave_read_first_lane(iM_eff * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); // Synchronize with producer to ensure input data is ready before processing tile if(kargs.async_input_scheduler.chunk_signals != nullptr) { const auto tiles_per_chunk = amd_wave_read_first_lane(kargs.async_input_scheduler.tiles_per_chunk_m); - const auto tile_idx_pivot = - amd_wave_read_first_lane(kargs.async_input_scheduler.tile_idx_pivot_m); const auto num_chunks = amd_wave_read_first_lane(kargs.async_input_scheduler.num_chunks); if(tiles_per_chunk > 0 && num_chunks > 0) { // Pivot allows rotating chunk assignments for load balancing - const auto chunk_idx = amd_wave_read_first_lane( - ((iM + tile_idx_pivot) / tiles_per_chunk) % num_chunks); + const auto chunk_idx = + amd_wave_read_first_lane((iM_eff / tiles_per_chunk) % num_chunks); workgroup_barrier chunk_barrier(kargs.async_input_scheduler.chunk_signals); chunk_barrier.wait_eq_wave(/*value=*/1, /*offset=*/chunk_idx); } diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp index 801207106b..e353dc8b54 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp @@ -903,13 +903,11 @@ struct GroupedConvolutionBackwardDataKernel const auto& 
d_block_window = MakeDBlockWindows(ds_ptr, kargs, group_id, block_idx_m, block_idx_n); - const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitted_k)); - const bool has_hot_loop = GemmPipeline::BlockHasHotloop(num_loop); - const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop); + const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitted_k)); // Run GEMM cooperatively by whole workgroup. const auto& c_block_tile = GemmPipeline{}.template operator()( - a_block_window, b_block_window, num_loop, has_hot_loop, tail_num, smem_ptr_0); + a_block_window, b_block_window, num_loop, smem_ptr_0); const index_t k_batch = amd_wave_read_first_lane(kargs.k_batch); diff --git a/profiler/include/profiler/grouped_convolution_backward_data_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_backward_data_tile_algs.hpp index 2fa2019b07..4bbf3eb6db 100644 --- a/profiler/include/profiler/grouped_convolution_backward_data_tile_algs.hpp +++ b/profiler/include/profiler/grouped_convolution_backward_data_tile_algs.hpp @@ -65,7 +65,9 @@ run_grouped_conv_backward_data_tile_algs(const ckt::Args& args, const ckt::Outputs& outputs, const ck_tile::stream_config& s_conf) { - float best_avg_time = std::numeric_limits::max(); + // Run first instance as dummy to get proper time from the first instance + bool dummy_run_executed = false; + float best_avg_time = std::numeric_limits::max(); std::string best_op_name, op_name; int best_split_k = 0; ck::index_t best_instance_index = -1; @@ -121,6 +123,13 @@ run_grouped_conv_backward_data_tile_algs(const ckt::Args& args, run_alg_func(args_k_batch, inputs, outputs, s_conf); if(is_supported) { + if((s_conf.time_kernel_ || s_conf.flush_cache_) && !dummy_run_executed) + { + // Run first instance twice + std::tie(is_supported, avg_time, op_name) = + run_alg_func(args_k_batch, inputs, outputs, s_conf); + dummy_run_executed = true; + } ckt::ValidationReport report; auto&& [rtol, atol] 
= get_rtol_atol(num_accums, k_batch, max_accumulated_value); diff --git a/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp index e79fc44e8d..697849a019 100644 --- a/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp +++ b/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp @@ -106,17 +106,17 @@ run_grouped_conv_backward_weight_tile_algs(const ckt::Args& args, { ckt::Args args_k_batch = args; args_k_batch.k_batch = k_batch; - if((s_conf.time_kernel_ || s_conf.flush_cache_) && !dummy_run_executed) - { - // Run first instance twice when profiling to stabilize timing - std::tie(is_supported, avg_time, op_name) = - run_alg_func(args_k_batch, inputs, outputs, s_conf); - dummy_run_executed = true; - } std::tie(is_supported, avg_time, op_name) = run_alg_func(args_k_batch, inputs, outputs, s_conf); if(is_supported) { + if((s_conf.time_kernel_ || s_conf.flush_cache_) && !dummy_run_executed) + { + // Run first instance twice when profiling to stabilize timing + std::tie(is_supported, avg_time, op_name) = + run_alg_func(args_k_batch, inputs, outputs, s_conf); + dummy_run_executed = true; + } ckt::ValidationReport report; auto&& [rtol, atol] = get_rtol_atol(num_accums, k_batch, max_accumulated_value); diff --git a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp index 054da8057a..a5e79706be 100644 --- a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp +++ b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp @@ -86,15 +86,16 @@ run_grouped_conv_forward_tile_algs(const ckt::Args& args, auto ref_conv = ReferenceInstance{}; auto ref_result = ckt::run(ref_conv, args, inputs, reference.get()); auto run_alg = [&](auto&& run_alg_func) { - if(!dummy_run_executed) - { - // Run first instance twice - 
std::tie(is_supported, avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); - dummy_run_executed = true; - } std::tie(is_supported, avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); if(is_supported) { + if((s_conf.time_kernel_ || s_conf.flush_cache_) && !dummy_run_executed) + { + // Run first instance twice + std::tie(is_supported, avg_time, op_name) = + run_alg_func(args, inputs, outputs, s_conf); + dummy_run_executed = true; + } best_avg_time = std::min(best_avg_time, avg_time); best_op_name = best_avg_time < avg_time ? best_op_name : op_name; std::cout << "Perf: " << std::setw(10) << avg_time << " ms," << " " << op_name diff --git a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp index 937fb24f5a..2d2f9982e9 100644 --- a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp @@ -197,10 +197,11 @@ bool profile_conv_bwd_data_impl(int do_verification, } std::string best_op_name; - float best_avg_time = 0; - float best_tflops = 0; - float best_gb_per_sec = 0; - int num_kernel = 0; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + int num_kernel = 0; + bool dummy_run_executed = false; for(auto& op_ptr : op_ptrs) { @@ -230,16 +231,38 @@ bool profile_conv_bwd_data_impl(int do_verification, // skip test if instance_index is specified continue; } - // for conv bwd data, some input tensor element are zero, but not written by kernel, - // need to set zero - in_device_buf.SetZero(); + if(!time_kernel) + { + // Don't clear for perf measurement. + // For non-grouped solver user has to clear input on his own. 
+ // for conv bwd data, some input tensor element are zero, but not written by kernel, + // need to set zero + in_device_buf.SetZero(); + } std::string op_name = op_ptr->GetTypeString(); auto invoker_ptr = op_ptr->MakeInvokerPointer(); - float avg_time = - invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + // Run first instance twice to get proper time + if(time_kernel && !dummy_run_executed) + { + invoker_ptr->Run(argument_ptr.get(), + StreamConfig{nullptr, + time_kernel, + 0 /*log_level*/, + 5 /*cold_iters*/, + 50 /*nrepeat_*/, + time_kernel /*flush_cache*/}); + dummy_run_executed = true; + } + float avg_time = invoker_ptr->Run(argument_ptr.get(), + StreamConfig{nullptr, + time_kernel, + 0 /*log_level*/, + 5 /*cold_iters*/, + 50 /*nrepeat_*/, + time_kernel /*flush_cache*/}); std::size_t flop = conv_param.GetFlops(); std::size_t num_btype = conv_param.GetByte(); diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index 8a5bf966b7..3d053b1dc1 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -287,6 +287,8 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, bool pass = true; index_t num_kernel = 0; index_t valid_instances = 0; + bool dummy_run_executed = false; + auto run_impl = [&](auto& op_ptr, auto& argument_ptr, const index_t& split_k_for_run) { // workspace_sz will be equal to 0 for other layout than NGCHW const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); @@ -317,8 +319,25 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, auto invoker_ptr = op_ptr->MakeInvokerPointer(); - float avg_time = - invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + // Run first instance twice to get proper time + if(time_kernel && !dummy_run_executed) + { + 
invoker_ptr->Run(argument_ptr.get(), + StreamConfig{nullptr, + time_kernel, + 0 /*log_level*/, + 5 /*cold_iters*/, + 50 /*nrepeat_*/, + time_kernel /*flush_cache*/}); + dummy_run_executed = true; + } + float avg_time = invoker_ptr->Run(argument_ptr.get(), + StreamConfig{nullptr, + time_kernel, + 0 /*log_level*/, + 5 /*cold_iters*/, + 50 /*nrepeat_*/, + time_kernel /*flush_cache*/}); std::size_t flop = conv_param.GetFlops(); std::size_t num_btype = conv_param.GetByte(); @@ -495,7 +514,6 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, { std::cout << "\nValid instances for this problem:" << std::endl; } - for(auto& op_ptr : op_ptrs) { for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++) diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index 44d000f8c6..24bc67a647 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -296,7 +296,8 @@ bool profile_grouped_conv_fwd_impl(int do_verification, index_t best_instance_index = 0; // profile device op instances - bool pass = true; + bool pass = true; + bool dummy_run_executed = false; auto run_impl = [&](auto& op_ptr, auto& argument_ptr) { // workspace_sz will be equal to 0 for other layout than NGCHW @@ -331,6 +332,19 @@ bool profile_grouped_conv_fwd_impl(int do_verification, auto invoker_ptr = op_ptr->MakeInvokerPointer(); + // Run first instance twice to get proper time + if(time_kernel && !dummy_run_executed) + { + invoker_ptr->Run(argument_ptr.get(), + StreamConfig{nullptr, + time_kernel, + 0 /*log_level*/, + 5 /*cold_iters*/, + 50 /*nrepeat_*/, + time_kernel /*flush_cache*/}); + dummy_run_executed = true; + } + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel, @@ -437,30 +451,6 @@ bool profile_grouped_conv_fwd_impl(int do_verification, std::cout << "\nValid instances for 
this problem:" << std::endl; } - // Run first instance twice to get proper time - { - auto argument_ptr = op_ptrs[0]->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(), - wei_device_buf.GetDeviceBuffer(), - {}, - out_device_buf.GetDeviceBuffer(), - a_g_n_c_wis_lengths, - a_g_n_c_wis_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - {}, - {}, - e_g_n_k_wos_lengths, - e_g_n_k_wos_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - in_element_op, - wei_element_op, - out_element_op); - - run_impl(op_ptrs[0], argument_ptr); - } for(auto& op_ptr : op_ptrs) { auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(), diff --git a/profiler/src/profile_grouped_conv_bwd_data_tile.cpp b/profiler/src/profile_grouped_conv_bwd_data_tile.cpp index fe51056805..280af3fe00 100644 --- a/profiler/src/profile_grouped_conv_bwd_data_tile.cpp +++ b/profiler/src/profile_grouped_conv_bwd_data_tile.cpp @@ -89,7 +89,8 @@ int call_profiler(const ckt::Args& args, 0 /*log_level*/, 5 /*cold_iters*/, 50 /*nrepeat_*/, - true /*is_gpu_timer_*/}); + true /*is_gpu_timer_*/, + time_kernel /*flush_cache*/}); if(time_kernel) { std::cout << "\nBest configuration parameters:" << "\n\tname: " << op_name << " (instance " diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index 320e5b1e91..63bf174643 100644 --- a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -69,4 +69,4 @@ add_subdirectory(fmha) add_subdirectory(gemm_tile_engine) add_subdirectory(pooling) add_subdirectory(grouped_conv) -add_subdirectory(gemm_streamk_tile_engine) +add_subdirectory(pooling_tile_engine) diff --git a/test/ck_tile/core/container/unit_sequence.cpp b/test/ck_tile/core/container/unit_sequence.cpp index 3769d6ecf9..2ce0d0f7e8 100644 --- a/test/ck_tile/core/container/unit_sequence.cpp +++ b/test/ck_tile/core/container/unit_sequence.cpp @@ -355,6 +355,102 @@ TEST(SequenceSort, SortSingleElement) 
EXPECT_TRUE((std::is_same::value)); } +// Test sequence_sort sorted2unsorted_map (index tracking) +TEST(SequenceSort, SortedMapUnsorted) +{ + using Seq = sequence<5, 2, 8, 1, 9>; + using Sort = sequence_sort>; + using Map = typename Sort::sorted2unsorted_map; + // sorted = <1,2,5,8,9>, original indices = <3,1,0,2,4> + using Expected = sequence<3, 1, 0, 2, 4>; + EXPECT_TRUE((std::is_same::value)); +} + +TEST(SequenceSort, SortedMapAlreadySorted) +{ + using Seq = sequence<1, 2, 3, 4, 5>; + using Sort = sequence_sort>; + using Map = typename Sort::sorted2unsorted_map; + // Already sorted: map should be identity + using Expected = sequence<0, 1, 2, 3, 4>; + EXPECT_TRUE((std::is_same::value)); +} + +TEST(SequenceSort, SortedMapRoundTrip) +{ + // Verify: sorted_values[i] == original[sorted2unsorted_map[i]] + using Seq = sequence<5, 2, 8, 1, 9>; + using Sort = sequence_sort>; + // sorted = <1,2,5,8,9>, map = <3,1,0,2,4> + EXPECT_EQ(Seq::at(Sort::sorted2unsorted_map::at(0)), Sort::type::at(0)); + EXPECT_EQ(Seq::at(Sort::sorted2unsorted_map::at(1)), Sort::type::at(1)); + EXPECT_EQ(Seq::at(Sort::sorted2unsorted_map::at(2)), Sort::type::at(2)); + EXPECT_EQ(Seq::at(Sort::sorted2unsorted_map::at(3)), Sort::type::at(3)); + EXPECT_EQ(Seq::at(Sort::sorted2unsorted_map::at(4)), Sort::type::at(4)); +} + +TEST(SequenceSort, SortedMapWithDuplicates) +{ + using Seq = sequence<3, 1, 3, 1>; + using Sort = sequence_sort>; + using Sorted = typename Sort::type; + using Map = typename Sort::sorted2unsorted_map; + // sorted = <1,1,3,3> + using ExpectedSorted = sequence<1, 1, 3, 3>; + EXPECT_TRUE((std::is_same::value)); + // Verify round-trip: original[map[i]] == sorted[i] for all i + // (don't assert specific index order for duplicates — sort stability may vary) + EXPECT_EQ(Seq::at(Map::at(0)), Sorted::at(0)); + EXPECT_EQ(Seq::at(Map::at(1)), Sorted::at(1)); + EXPECT_EQ(Seq::at(Map::at(2)), Sorted::at(2)); + EXPECT_EQ(Seq::at(Map::at(3)), Sorted::at(3)); +} + +TEST(SequenceSort, 
SortedMapReverseSorted) +{ + using Seq = sequence<5, 4, 3, 2, 1>; + using Sort = sequence_sort>; + using Sorted = typename Sort::type; + using Map = typename Sort::sorted2unsorted_map; + using ExpSorted = sequence<1, 2, 3, 4, 5>; + using ExpMap = sequence<4, 3, 2, 1, 0>; + EXPECT_TRUE((std::is_same::value)); + EXPECT_TRUE((std::is_same::value)); +} + +TEST(SequenceSort, SortedMapEmpty) +{ + using Sort = sequence_sort, less>; + using Map = typename Sort::sorted2unsorted_map; + EXPECT_TRUE((std::is_same>::value)); +} + +TEST(SequenceSort, SortedMapSingleElement) +{ + using Sort = sequence_sort, less>; + using Map = typename Sort::sorted2unsorted_map; + EXPECT_TRUE((std::is_same>::value)); +} + +// Test sequence_unique_sort sorted2unsorted_map +TEST(SequenceUniqueSort, UniqueSortMap) +{ + using Seq = sequence<3, 1, 4, 1, 5, 9, 2, 6, 5>; + using Result = sequence_unique_sort, equal>; + using Map = typename Result::sorted2unsorted_map; + // sorted unique = <1,2,3,4,5,6,9> + // The map should reference the first occurrence of each unique value in the original + // Verify round-trip: for each i, original[map[i]] == sorted_unique[i] + using Values = typename Result::type; + EXPECT_EQ(Seq::at(Map::at(0)), Values::at(0)); // 1 + EXPECT_EQ(Seq::at(Map::at(1)), Values::at(1)); // 2 + EXPECT_EQ(Seq::at(Map::at(2)), Values::at(2)); // 3 + EXPECT_EQ(Seq::at(Map::at(3)), Values::at(3)); // 4 + EXPECT_EQ(Seq::at(Map::at(4)), Values::at(4)); // 5 + EXPECT_EQ(Seq::at(Map::at(5)), Values::at(5)); // 6 + EXPECT_EQ(Seq::at(Map::at(6)), Values::at(6)); // 9 +} + // Test sequence_unique_sort TEST(SequenceUniqueSort, UniqueSort) { @@ -405,6 +501,24 @@ TEST(SequenceMap, InvalidMapMissing) EXPECT_FALSE((is_valid_sequence_map::value)); } +TEST(SequenceMap, InvalidMapNegative) +{ + using Map = sequence<0, -1, 2>; + EXPECT_FALSE((is_valid_sequence_map::value)); +} + +TEST(SequenceMap, ValidMapSingleElement) +{ + EXPECT_TRUE((is_valid_sequence_map>::value)); +} + +TEST(SequenceMap, 
InvalidMapSingleElement) +{ + EXPECT_FALSE((is_valid_sequence_map>::value)); +} + +TEST(SequenceMap, ValidMapEmpty) { EXPECT_TRUE((is_valid_sequence_map>::value)); } + // Test sequence_map_inverse // Note: sequence_map_inverse inverts a mapping where Map[i] = j means old position i maps to new // position j The inverse gives us new position i came from old position inverse[i] diff --git a/test/ck_tile/fmha/test_fmha_bwd.cpp b/test/ck_tile/fmha/test_fmha_bwd.cpp index e1035bffe4..3aee76131e 100644 --- a/test/ck_tile/fmha/test_fmha_bwd.cpp +++ b/test/ck_tile/fmha/test_fmha_bwd.cpp @@ -91,7 +91,8 @@ void fmha_bwd_test(const FmhaBwdTestParam& param) drop_offset, drop_prefs, mask_str, - det, // deterministic + false, // sink_grad + det, // deterministic init_method, static_cast(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))), 1, @@ -333,7 +334,8 @@ TEST_P(BasicQPadding, DataTypeConfig) drop_offset, drop_prefs, mask_str, - det, + false, // sink_grad + det, // deterministic init_method, static_cast(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))), 1, @@ -419,7 +421,8 @@ TEST_P(BasicKVPadding, DataTypeConfig) drop_offset, drop_prefs, mask_str, - det, + false, // sink_grad + det, // deterministic init_method, static_cast(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))), 1, @@ -513,7 +516,8 @@ TEST_P(QKVPadding, DataTypeConfig) drop_offset, drop_prefs, mask_str, - det, + false, // sink_grad + det, // deterministic init_method, static_cast(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))), 1, @@ -620,7 +624,8 @@ TEST_P(ZeroLengthPadding, DataTypeConfig) drop_offset, drop_prefs, mask_str, - det, + false, // sink_grad + det, // deterministic init_method, static_cast(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))), 1, @@ -741,7 +746,8 @@ TEST_P(VariedPaddingRatios, DataTypeConfig) drop_offset, drop_prefs, mask_str, - det, + false, // sink_grad + det, // deterministic init_method, static_cast(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))), 1, @@ -843,7 +849,8 @@ 
TEST_P(PaddingWithMask, DataTypeConfig) drop_offset, drop_prefs, mask_str, - det, + false, // sink_grad + det, // deterministic init_method, static_cast(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))), 1, @@ -977,7 +984,8 @@ TEST_P(MultiBatchPadding, DataTypeConfig) drop_offset, drop_prefs, mask_str, - det, + false, // sink_grad + det, // deterministic init_method, static_cast(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))), 1, diff --git a/test/ck_tile/gemm_streamk/CMakeLists.txt b/test/ck_tile/gemm_streamk/CMakeLists.txt index 636900db8e..2c5b3bb04c 100644 --- a/test/ck_tile/gemm_streamk/CMakeLists.txt +++ b/test/ck_tile/gemm_streamk/CMakeLists.txt @@ -19,55 +19,93 @@ set(EXAMPLE_GEMM_COMPILE_COMPUTE_ASYNC_OPTIONS ${EXAMPLE_GEMM_COMPILE_COMPUTE_V4 if(GPU_TARGETS MATCHES "gfx90a|gfx942|gfx950") include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}) - + #TODO: support all arches #TODO: current c-shuffle only supports C layout as R add_gtest_executable(test_ck_tile_streamk_tile_partitioner test_streamk_tile_partitioner.cpp) - set(STREAMK_EXTENDED_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_fp16_persistent_compv3.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_fp16_persistent_compv4.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_fp16_persistent_mem.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_bf16_persistent_compv3.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_bf16_persistent_compv4.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_bf16_persistent_mem.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_fp16_nonpersistent_compv3.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_fp16_nonpersistent_compv4.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_fp16_nonpersistent_mem.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_bf16_nonpersistent_compv3.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_bf16_nonpersistent_compv4.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_bf16_nonpersistent_mem.cpp - test_gemm_streamk_util.cpp) - # We only test fp8 and bf8 on gfx942 and gfx950 since these types are not natively supported on gfx90a - if(GPU_TARGETS MATCHES "gfx942|gfx950") - list(APPEND STREAMK_EXTENDED_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_fp8_persistent_compv3.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_fp8_persistent_compv4.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_fp8_persistent_mem.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_bf8_persistent_compv3.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_bf8_persistent_compv4.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_bf8_persistent_mem.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_fp8_nonpersistent_compv3.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_fp8_nonpersistent_compv4.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_fp8_nonpersistent_mem.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_bf8_nonpersistent_compv3.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_bf8_nonpersistent_compv4.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extended_tests/test_gemm_streamk_bf8_nonpersistent_mem.cpp) - endif() + # ---- Code-generate test .cpp files from types header ---- + set(STREAMK_TYPES_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/test_gemm_streamk_types.hpp) + set(STREAMK_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/generate_test_files.py) - add_gtest_executable(test_ck_tile_streamk_extended ${STREAMK_EXTENDED_SOURCES}) - target_compile_options(test_ck_tile_streamk_extended PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) + # Re-run configure automatically if the types header changes (e.g. 
types added/removed) + # or if the generation script changes + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${STREAMK_TYPES_HEADER} ${STREAMK_GEN_SCRIPT}) + + # Define the targets and their corresponding executable names + set(STREAMK_GEN_TARGETS extended atomic_smoke linear_smoke tree_smoke pipelines_smoke) + set(STREAMK_GEN_EXEC_EXTENDED test_ck_tile_streamk_extended) + set(STREAMK_GEN_EXEC_ATOMIC_SMOKE test_ck_tile_streamk_atomic_smoke) + set(STREAMK_GEN_EXEC_LINEAR_SMOKE test_ck_tile_streamk_linear_smoke) + set(STREAMK_GEN_EXEC_TREE_SMOKE test_ck_tile_streamk_tree_smoke) + set(STREAMK_GEN_EXEC_PIPELINES_SMOKE test_ck_tile_streamk_pipelines_smoke) # Collect all test targets for umbrella label set(CK_TILE_GEMM_STREAMK_TEST_TARGETS - test_ck_tile_streamk_tile_partitioner - test_ck_tile_streamk_extended + test_ck_tile_streamk_tile_partitioner) + + foreach(target IN LISTS STREAMK_GEN_TARGETS) + string(TOUPPER ${target} TARGET_UPPER) + set(GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/${target}) + set(EXEC_NAME ${STREAMK_GEN_EXEC_${TARGET_UPPER}}) + set(LIST_FILE ${CMAKE_CURRENT_BINARY_DIR}/${target}_files.txt) + + # Phase 1 (configure time): discover the list of files that will be generated + execute_process( + COMMAND ${Python3_EXECUTABLE} ${STREAMK_GEN_SCRIPT} + --types_header ${STREAMK_TYPES_HEADER} + --output_dir ${GEN_DIR} + --target ${target} + --list_files ${LIST_FILE} + RESULT_VARIABLE ret + ERROR_VARIABLE list_files_stderr) + if(ret AND NOT ret EQUAL 0) + message(FATAL_ERROR + "Failed to list ${target} test files via Python: ${ret}\n" + "stderr: ${list_files_stderr}" + ) + endif() + file(STRINGS ${LIST_FILE} ALL_SOURCES_${target}) + + # Phase 2 (build time): generate the .cpp files when the types header changes + add_custom_command( + OUTPUT ${ALL_SOURCES_${target}} + COMMAND ${Python3_EXECUTABLE} ${STREAMK_GEN_SCRIPT} + --types_header ${STREAMK_TYPES_HEADER} + --output_dir ${GEN_DIR} + --target ${target} + --gen_files + DEPENDS 
${STREAMK_TYPES_HEADER} ${STREAMK_GEN_SCRIPT} + COMMENT "Generating StreamK ${target} test sources from types header") + + # Filter out fp8/bf8 sources on gfx90a since those types are not natively supported + set(FILTERED_SOURCES) + foreach(src IN LISTS ALL_SOURCES_${target}) + if(NOT src MATCHES "_(fp8|bf8)_" OR GPU_TARGETS MATCHES "gfx942|gfx950") + list(APPEND FILTERED_SOURCES ${src}) + endif() + endforeach() + list(APPEND FILTERED_SOURCES test_gemm_streamk_util.cpp) + + add_gtest_executable(${EXEC_NAME} ${FILTERED_SOURCES}) + target_compile_options(${EXEC_NAME} PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) + + list(APPEND CK_TILE_GEMM_STREAMK_TEST_TARGETS ${EXEC_NAME}) + endforeach() + + # Add python unit tests to validate the code gen logic in generate_test_files.py + add_test( + NAME test_ck_tile_streamk_generate_test_files + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_generate_test_files.py -v + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/.. ) # Label all ck_tile gemm_streamk tests with CK_TILE_GEMM_STREAMK_TESTS for selective execution foreach(test_target ${CK_TILE_GEMM_STREAMK_TEST_TARGETS}) set_tests_properties(${test_target} PROPERTIES LABELS "CK_TILE_GEMM_STREAMK_TESTS") endforeach() + # Also label the Python test + set_tests_properties(test_ck_tile_streamk_generate_test_files PROPERTIES LABELS "CK_TILE_GEMM_STREAMK_TESTS") # Umbrella target to build and run all ck_tile gemm_streamk tests # Usage: ninja ck_tile_gemm_streamk_tests diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_nonpersistent_compv3.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_nonpersistent_compv3.cpp deleted file mode 100644 index 2e35690b3d..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_nonpersistent_compv3.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKBf16NonPersistentCompV3 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKBf16NonPersistentCompV3 - -TYPED_TEST_SUITE(TestCkTileStreamKBf16NonPersistentCompV3, - KernelTypesStreamKBf16NonPersistentCompV3); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_nonpersistent_compv4.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_nonpersistent_compv4.cpp deleted file mode 100644 index e0e1b30065..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_nonpersistent_compv4.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKBf16NonPersistentCompV4 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKBf16NonPersistentCompV4 - -TYPED_TEST_SUITE(TestCkTileStreamKBf16NonPersistentCompV4, - KernelTypesStreamKBf16NonPersistentCompV4); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_nonpersistent_mem.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_nonpersistent_mem.cpp deleted file mode 100644 index ab1dbffcdb..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_nonpersistent_mem.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKBf16NonPersistentMem : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKBf16NonPersistentMem - -TYPED_TEST_SUITE(TestCkTileStreamKBf16NonPersistentMem, KernelTypesStreamKBf16NonPersistentMem); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_persistent_compv3.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_persistent_compv3.cpp deleted file mode 100644 index 24385201a1..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_persistent_compv3.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKBf16PersistentCompV3 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKBf16PersistentCompV3 - -TYPED_TEST_SUITE(TestCkTileStreamKBf16PersistentCompV3, KernelTypesStreamKBf16PersistentCompV3); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_persistent_compv4.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_persistent_compv4.cpp deleted file mode 100644 index 2c7a40cea9..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_persistent_compv4.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKBf16PersistentCompV4 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKBf16PersistentCompV4 - -TYPED_TEST_SUITE(TestCkTileStreamKBf16PersistentCompV4, KernelTypesStreamKBf16PersistentCompV4); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_persistent_mem.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_persistent_mem.cpp deleted file mode 100644 index 94f9def529..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf16_persistent_mem.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKBf16PersistentMem : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKBf16PersistentMem - -TYPED_TEST_SUITE(TestCkTileStreamKBf16PersistentMem, KernelTypesStreamKBf16PersistentMem); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_nonpersistent_compv3.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_nonpersistent_compv3.cpp deleted file mode 100644 index a0a04d79e2..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_nonpersistent_compv3.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKBf8NonPersistentCompV3 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKBf8NonPersistentCompV3 - -TYPED_TEST_SUITE(TestCkTileStreamKBf8NonPersistentCompV3, KernelTypesStreamKBf8NonPersistentCompV3); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_nonpersistent_compv4.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_nonpersistent_compv4.cpp deleted file mode 100644 index 5fada00248..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_nonpersistent_compv4.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKBf8NonPersistentCompV4 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKBf8NonPersistentCompV4 - -TYPED_TEST_SUITE(TestCkTileStreamKBf8NonPersistentCompV4, KernelTypesStreamKBf8NonPersistentCompV4); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_nonpersistent_mem.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_nonpersistent_mem.cpp deleted file mode 100644 index 5a6447416d..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_nonpersistent_mem.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKBf8NonPersistentMem : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKBf8NonPersistentMem - -TYPED_TEST_SUITE(TestCkTileStreamKBf8NonPersistentMem, KernelTypesStreamKBf8NonPersistentMem); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_persistent_compv3.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_persistent_compv3.cpp deleted file mode 100644 index 0a6c2346d8..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_persistent_compv3.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKBf8PersistentCompV3 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKBf8PersistentCompV3 - -TYPED_TEST_SUITE(TestCkTileStreamKBf8PersistentCompV3, KernelTypesStreamKBf8PersistentCompV3); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_persistent_compv4.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_persistent_compv4.cpp deleted file mode 100644 index cd48886f84..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_persistent_compv4.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKBf8PersistentCompV4 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKBf8PersistentCompV4 - -TYPED_TEST_SUITE(TestCkTileStreamKBf8PersistentCompV4, KernelTypesStreamKBf8PersistentCompV4); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_persistent_mem.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_persistent_mem.cpp deleted file mode 100644 index 1eef56c971..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_bf8_persistent_mem.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKBf8PersistentMem : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKBf8PersistentMem - -TYPED_TEST_SUITE(TestCkTileStreamKBf8PersistentMem, KernelTypesStreamKBf8PersistentMem); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_nonpersistent_compv3.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_nonpersistent_compv3.cpp deleted file mode 100644 index 3381554d1e..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_nonpersistent_compv3.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKFp16NonPersistentCompV3 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKFp16NonPersistentCompV3 - -TYPED_TEST_SUITE(TestCkTileStreamKFp16NonPersistentCompV3, - KernelTypesStreamKFp16NonPersistentCompV3); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_nonpersistent_compv4.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_nonpersistent_compv4.cpp deleted file mode 100644 index e6b632b0b1..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_nonpersistent_compv4.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKFp16NonPersistentCompV4 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKFp16NonPersistentCompV4 - -TYPED_TEST_SUITE(TestCkTileStreamKFp16NonPersistentCompV4, - KernelTypesStreamKFp16NonPersistentCompV4); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_nonpersistent_mem.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_nonpersistent_mem.cpp deleted file mode 100644 index 2f7dd7be33..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_nonpersistent_mem.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKFp16NonPersistentMem : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKFp16NonPersistentMem - -TYPED_TEST_SUITE(TestCkTileStreamKFp16NonPersistentMem, KernelTypesStreamKFp16NonPersistentMem); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_persistent_compv3.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_persistent_compv3.cpp deleted file mode 100644 index 3c041a3652..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_persistent_compv3.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKFp16PersistentCompV3 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKFp16PersistentCompV3 - -TYPED_TEST_SUITE(TestCkTileStreamKFp16PersistentCompV3, KernelTypesStreamKFp16PersistentCompV3); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_persistent_compv4.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_persistent_compv4.cpp deleted file mode 100644 index 8117a7ce96..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_persistent_compv4.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKFp16PersistentCompV4 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKFp16PersistentCompV4 - -TYPED_TEST_SUITE(TestCkTileStreamKFp16PersistentCompV4, KernelTypesStreamKFp16PersistentCompV4); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_persistent_mem.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_persistent_mem.cpp deleted file mode 100644 index c05135943f..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp16_persistent_mem.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKFp16PersistentMem : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKFp16PersistentMem - -TYPED_TEST_SUITE(TestCkTileStreamKFp16PersistentMem, KernelTypesStreamKFp16PersistentMem); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_nonpersistent_compv3.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_nonpersistent_compv3.cpp deleted file mode 100644 index 379702a10a..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_nonpersistent_compv3.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKFp8NonPersistentCompV3 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKFp8NonPersistentCompV3 - -TYPED_TEST_SUITE(TestCkTileStreamKFp8NonPersistentCompV3, KernelTypesStreamKFp8NonPersistentCompV3); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_nonpersistent_compv4.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_nonpersistent_compv4.cpp deleted file mode 100644 index bf4dfc30f8..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_nonpersistent_compv4.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKFp8NonPersistentCompV4 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKFp8NonPersistentCompV4 - -TYPED_TEST_SUITE(TestCkTileStreamKFp8NonPersistentCompV4, KernelTypesStreamKFp8NonPersistentCompV4); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_nonpersistent_mem.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_nonpersistent_mem.cpp deleted file mode 100644 index 3d545a61c6..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_nonpersistent_mem.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKFp8NonPersistentMem : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKFp8NonPersistentMem - -TYPED_TEST_SUITE(TestCkTileStreamKFp8NonPersistentMem, KernelTypesStreamKFp8NonPersistentMem); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_persistent_compv3.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_persistent_compv3.cpp deleted file mode 100644 index dccdcaf270..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_persistent_compv3.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKFp8PersistentCompV3 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKFp8PersistentCompV3 - -TYPED_TEST_SUITE(TestCkTileStreamKFp8PersistentCompV3, KernelTypesStreamKFp8PersistentCompV3); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_persistent_compv4.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_persistent_compv4.cpp deleted file mode 100644 index 8cbab5c8f8..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_persistent_compv4.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKFp8PersistentCompV4 : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKFp8PersistentCompV4 - -TYPED_TEST_SUITE(TestCkTileStreamKFp8PersistentCompV4, KernelTypesStreamKFp8PersistentCompV4); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_persistent_mem.cpp b/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_persistent_mem.cpp deleted file mode 100644 index 88ebdf1e55..0000000000 --- a/test/ck_tile/gemm_streamk/extended_tests/test_gemm_streamk_fp8_persistent_mem.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_streamk_common_includes.hpp" - -template -class TestCkTileStreamKFp8PersistentMem : public TestCkTileStreamK -{ -}; - -#define TEST_SUITE_NAME TestCkTileStreamKFp8PersistentMem - -TYPED_TEST_SUITE(TestCkTileStreamKFp8PersistentMem, KernelTypesStreamKFp8PersistentMem); - -#include "test_gemm_streamk_extended_cases.inc" - -#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/generate_test_files.py b/test/ck_tile/gemm_streamk/generate_test_files.py new file mode 100644 index 0000000000..61a28c2a46 --- /dev/null +++ b/test/ck_tile/gemm_streamk/generate_test_files.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +"""Generate test .cpp files from KernelTypes definitions in +test_gemm_streamk_types.hpp. + +Two modes: + --list_files FILE Write the list of output file paths to FILE (one per line) + without generating the files. Used at CMake configure time. + --gen_files Actually emit the .cpp files into --output_dir. + Used at build time via add_custom_command. 
+ +Target selection (--target): + extended Kernel types containing 'Atomic' or 'Pipelines' + -> includes test_gemm_streamk_extended_cases.inc + atomic_smoke Kernel types containing 'Atomic' (not 'Pipelines') + -> includes test_gemm_streamk_atomic_cases.inc + linear_smoke Kernel types containing 'Linear' (not 'Pipelines') + -> includes test_gemm_streamk_reduction_cases.inc + tree_smoke Kernel types containing 'Tree' (not 'Pipelines') + -> includes test_gemm_streamk_reduction_cases.inc + pipelines_smoke Kernel types matching 'Pipelines' + -> includes test_gemm_streamk_reduction_cases.inc + and test_gemm_streamk_atomic_cases.inc +""" + +import argparse +import os +import re +import sys + +# --------------------------------------------------------------------------- # +# Template for every generated .cpp file +# --------------------------------------------------------------------------- # +CPP_TEMPLATE = """\ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "test_gemm_streamk_common_includes.hpp" + +template +class {class_name} : public TestCkTileStreamK +{{ +}}; + +#define TEST_SUITE_NAME {class_name} + +TYPED_TEST_SUITE({class_name}, {type_alias}); + +{inc_includes} + +#undef TEST_SUITE_NAME +""" + +# --------------------------------------------------------------------------- # +# Target definitions: filter predicate and .inc files +# --------------------------------------------------------------------------- # +TARGETS = { + "extended": { + "filter": lambda suffix: "Atomic" in suffix or suffix == "Pipelines", + "inc_files": ["test_gemm_streamk_extended_cases.inc"], + }, + "atomic_smoke": { + "filter": lambda suffix: "Atomic" in suffix and suffix != "Pipelines", + "inc_files": ["test_gemm_streamk_atomic_cases.inc"], + }, + "linear_smoke": { + "filter": lambda suffix: "Linear" in suffix and suffix != "Pipelines", + "inc_files": ["test_gemm_streamk_reduction_cases.inc"], + }, + "tree_smoke": { + "filter": 
lambda suffix: "Tree" in suffix and suffix != "Pipelines", + "inc_files": ["test_gemm_streamk_reduction_cases.inc"], + }, + "pipelines_smoke": { + "filter": lambda suffix: suffix == "Pipelines", + "inc_files": [ + "test_gemm_streamk_reduction_cases.inc", + "test_gemm_streamk_atomic_cases.inc", + ], + }, +} + +# --------------------------------------------------------------------------- # +# Mapping from CamelCase suffix fragments to file-name fragments +# --------------------------------------------------------------------------- # +KNOWN_TOKENS = [ + ("Fp16", "fp16"), + ("Bf16", "bf16"), + ("Fp8", "fp8"), + ("Bf8", "bf8"), + ("NonPersistent", "nonpersistent"), + ("Persistent", "persistent"), + ("Atomic", "atomic"), + ("Linear", "linear"), + ("Tree", "tree"), + ("CompV3", "compv3"), + ("Pipelines", "pipelines"), +] + + +def suffix_to_file_tag(suffix: str) -> str: + """Convert a CamelCase suffix like 'Fp16PersistentAtomicCompV3' to + 'fp16_persistent_atomic_compv3'.""" + parts: list[str] = [] + remaining = suffix + while remaining: + matched = False + for token, replacement in KNOWN_TOKENS: + if remaining.startswith(token): + parts.append(replacement) + remaining = remaining[len(token) :] + matched = True + break + if not matched: + raise ValueError( + f"Unrecognised token in KernelTypes suffix: '{remaining}' " + f"(from '{suffix}')" + ) + return "_".join(parts) + + +def parse_types_header(header_path: str, target: str) -> list[dict]: + """Return a list of dicts with keys: type_alias, class_name, file_tag, suffix.""" + target_def = TARGETS[target] + # Pattern matches lines like: using KernelTypesStreamKFp16PersistentAtomicCompV3 = ... + pattern = re.compile(r"using\s+(KernelTypesStreamK(\w+))\s*=") + entries: list[dict] = [] + with open(header_path) as f: + for line in f: + match = pattern.search(line) + if match: + # If the match is: using KernelTypesStreamKFp16PersistentAtomicCompV3 = ... 
+ # type_alias is KernelTypesStreamKFp16PersistentAtomicCompV3 + # suffix is Fp16PersistentAtomicCompV3 + type_alias = match.group(1) + suffix = match.group(2) + if not target_def["filter"](suffix): + continue + entries.append( + { + "type_alias": type_alias, + "class_name": f"TestCkTileStreamK{suffix}", + "file_tag": suffix_to_file_tag(suffix), + } + ) + return entries + + +def output_path(output_dir: str, entry: dict) -> str: + return os.path.join(output_dir, f"test_gemm_streamk_{entry['file_tag']}.cpp") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--types_header", required=True, help="Path to test_gemm_streamk_types.hpp" + ) + parser.add_argument( + "--output_dir", required=True, help="Directory for generated .cpp files" + ) + parser.add_argument( + "--target", + required=True, + choices=list(TARGETS.keys()), + help="Which target to generate files for", + ) + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + "--list_files", + metavar="FILE", + help="Write output file paths to FILE then exit", + ) + group.add_argument( + "--gen_files", action="store_true", help="Generate the .cpp files" + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + entries = parse_types_header(args.types_header, args.target) + if not entries: + print( + f"ERROR: no KernelTypesStreamK* definitions found for target " + f"'{args.target}' in {args.types_header}", + file=sys.stderr, + ) + sys.exit(1) + + inc_files = TARGETS[args.target]["inc_files"] + inc_includes = "\n".join(f'#include "{f}"' for f in inc_files) + + if args.list_files: + os.makedirs(os.path.dirname(args.list_files) or ".", exist_ok=True) + with open(args.list_files, "w") as f: + for entry in entries: + f.write(output_path(args.output_dir, entry) + "\n") + else: + os.makedirs(args.output_dir, exist_ok=True) + for entry in entries: + path = output_path(args.output_dir, entry) + content 
= CPP_TEMPLATE.format( + class_name=entry["class_name"], + type_alias=entry["type_alias"], + inc_includes=inc_includes, + ) + with open(path, "w") as f: + f.write(content) + + +if __name__ == "__main__": + main() diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_atomic_cases.inc b/test/ck_tile/gemm_streamk/test_gemm_streamk_atomic_cases.inc new file mode 100644 index 0000000000..4bd6e9d973 --- /dev/null +++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_atomic_cases.inc @@ -0,0 +1,47 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +TYPED_TEST(TEST_SUITE_NAME, StreamK_EdgeCase) +{ + ck_tile::index_t M = 256; + ck_tile::index_t N = 256; + ck_tile::index_t K = 256; + + this->Run(M, N, K); +} + +TYPED_TEST(TEST_SUITE_NAME, StreamK_DPOnly) +{ + const ck_tile::index_t num_cu = get_cu_count(); + constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, TypeParam>::value; + constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, TypeParam>::value; + constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, TypeParam>::value; + + // For DP only, we ensure that the number of tiles is a multiple of the number of CUs. This + // assumes tile sizes are large enough such that occupancy is 1. + ck_tile::index_t M = M_Tile * num_cu; + ck_tile::index_t N = N_Tile; + ck_tile::index_t K = K_Tile; + + this->Run(M, N, K); +} + +TYPED_TEST(TEST_SUITE_NAME, StreamK_SKOnly) +{ + const ck_tile::index_t num_cu = get_cu_count(); + constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, TypeParam>::value; + constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, TypeParam>::value; + constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, TypeParam>::value; + + // For SK only, we have 4 macro tiles in C. But, we need to make sure there is enough work along + // the K dimension to avoid falling into the edge case. Thus, we always have at least num_cu + // macro tiles in the K dimension. 
This assumes tile sizes are large enough such that occupancy + // is 1. + ck_tile::index_t M = M_Tile * 2; + ck_tile::index_t N = N_Tile * 2; + ck_tile::index_t K = K_Tile * num_cu; + + this->Run(M, N, K); +} diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_reduction_cases.inc b/test/ck_tile/gemm_streamk/test_gemm_streamk_reduction_cases.inc new file mode 100644 index 0000000000..e05969c1c7 --- /dev/null +++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_reduction_cases.inc @@ -0,0 +1,46 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +TYPED_TEST(TEST_SUITE_NAME, StreamK_SKOnly_OneTile) +{ + const ck_tile::index_t num_cu = get_cu_count(); + constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, TypeParam>::value; + constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, TypeParam>::value; + constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, TypeParam>::value; + + ck_tile::index_t M = M_Tile; + ck_tile::index_t N = N_Tile; + ck_tile::index_t K = K_Tile * num_cu; + + this->Run(M, N, K); +} + +TYPED_TEST(TEST_SUITE_NAME, StreamK_SKOnly_4Tiles_Reduction) +{ + const ck_tile::index_t num_cu = get_cu_count(); + constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, TypeParam>::value; + constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, TypeParam>::value; + constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, TypeParam>::value; + + ck_tile::index_t M = M_Tile * 4; + ck_tile::index_t N = N_Tile; + ck_tile::index_t K = K_Tile * num_cu + (25 * K_Tile); + + this->Run(M, N, K); +} + +TYPED_TEST(TEST_SUITE_NAME, StreamK_SKOnly_21Tiles) +{ + const ck_tile::index_t num_cu = get_cu_count(); + constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, TypeParam>::value; + constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, TypeParam>::value; + constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, TypeParam>::value; + + ck_tile::index_t M = M_Tile * 3; 
+ ck_tile::index_t N = N_Tile * 7; + ck_tile::index_t K = K_Tile * num_cu + (30 * K_Tile); + + this->Run(M, N, K); +} diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp index bfe236b37f..cbd3f0f066 100644 --- a/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp +++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp @@ -14,16 +14,28 @@ using BF16 = ck_tile::bf16_t; using BF8 = ck_tile::bf8_t; using F32 = float; +// Layouts using Row = ck_tile::tensor_layout::gemm::RowMajor; using Col = ck_tile::tensor_layout::gemm::ColumnMajor; +// Persistence +using Persistent = std::true_type; +using NonPersistent = std::false_type; + +// Pipelines using Mem = ck_tile::integral_constant; using CompV3 = ck_tile::integral_constant; using CompV4 = ck_tile::integral_constant; -using Persistent = std::true_type; -using NonPersistent = std::false_type; +// Reduction Strategies +using Atomic = ck_tile::integral_constant; +using Linear = ck_tile::integral_constant; +using Tree = ck_tile::integral_constant; +using I16 = ck_tile::number<16>; using I32 = ck_tile::number<32>; using I128 = ck_tile::number<128>; using I256 = ck_tile::number<256>; @@ -32,180 +44,157 @@ using I256 = ck_tile::number<256>; // ========================== CompV3 Pipeline ========================== -using KernelTypesStreamKFp16PersistentCompV3 = ::testing::Types< -// ALayout BLayout CLayout ADataType BDataType AccDataType CDataType M_MacroTile N_MacroTile K_MacroTile Persistent Pipeline +// Atomics +using KernelTypesStreamKFp16PersistentAtomicCompV3 = ::testing::Types< +// ALayout BLayout CLayout ADataType BDataType AccDataType CDataType M_MacroTile N_MacroTile K_MacroTile M_WaveTile N_WaveTile K_WaveTile Persistent Pipeline ReductionStrategy - std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, Persistent, CompV3>, - std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, Persistent, CompV3>, - std::tuple< Col, Col, 
Row, F16, F16, F32, F16, I256, I256, I32, Persistent, CompV3>, - std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I32, Persistent, CompV3> + std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Atomic>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Atomic>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Atomic>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Atomic> >; -using KernelTypesStreamKBf16PersistentCompV3 = ::testing::Types< - std::tuple< Row, Row, Row, BF16, BF16, F32, BF16, I256, I256, I32, Persistent, CompV3>, - std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, Persistent, CompV3>, - std::tuple< Col, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, Persistent, CompV3>, - std::tuple< Col, Row, Row, BF16, BF16, F32, BF16, I256, I256, I32, Persistent, CompV3> +using KernelTypesStreamKBf16PersistentAtomicCompV3 = ::testing::Types< + std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Atomic> >; -using KernelTypesStreamKBf8PersistentCompV3 = ::testing::Types< - std::tuple< Row, Row, Row, BF8, BF8, F32, BF16, I128, I128, I32, Persistent, CompV3>, - std::tuple< Row, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, Persistent, CompV3>, - std::tuple< Col, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, Persistent, CompV3>, - std::tuple< Col, Row, Row, BF8, BF8, F32, BF16, I128, I128, I32, Persistent, CompV3> +using KernelTypesStreamKBf8PersistentAtomicCompV3 = ::testing::Types< + std::tuple< Row, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Atomic> >; -using KernelTypesStreamKFp8PersistentCompV3 = ::testing::Types< - std::tuple< Row, Row, Row, F8, F8, F32, F16, I128, I128, I32, Persistent, CompV3>, - std::tuple< Row, Col, Row, F8, F8, F32, F16, 
I128, I128, I32, Persistent, CompV3>, - std::tuple< Col, Col, Row, F8, F8, F32, F16, I128, I128, I32, Persistent, CompV3>, - std::tuple< Col, Row, Row, F8, F8, F32, F16, I128, I128, I32, Persistent, CompV3> +using KernelTypesStreamKFp8PersistentAtomicCompV3 = ::testing::Types< + std::tuple< Row, Row, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Atomic>, + std::tuple< Row, Col, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Atomic>, + std::tuple< Col, Col, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Atomic>, + std::tuple< Col, Row, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Atomic> >; -using KernelTypesStreamKFp16NonPersistentCompV3 = ::testing::Types< - std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, NonPersistent, CompV3>, - std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, NonPersistent, CompV3>, - std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, NonPersistent, CompV3>, - std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I32, NonPersistent, CompV3> +using KernelTypesStreamKFp16NonPersistentAtomicCompV3 = ::testing::Types< + std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Atomic>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Atomic>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Atomic>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Atomic> >; -using KernelTypesStreamKBf16NonPersistentCompV3 = ::testing::Types< - std::tuple< Row, Row, Row, BF16, BF16, F32, BF16, I256, I256, I32, NonPersistent, CompV3>, - std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, NonPersistent, CompV3>, - std::tuple< Col, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, 
NonPersistent, CompV3>, - std::tuple< Col, Row, Row, BF16, BF16, F32, BF16, I256, I256, I32, NonPersistent, CompV3> +using KernelTypesStreamKBf16NonPersistentAtomicCompV3 = ::testing::Types< + std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Atomic> >; -using KernelTypesStreamKBf8NonPersistentCompV3 = ::testing::Types< - std::tuple< Row, Row, Row, BF8, BF8, F32, BF16, I128, I128, I32, NonPersistent, CompV3>, - std::tuple< Row, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, NonPersistent, CompV3>, - std::tuple< Col, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, NonPersistent, CompV3>, - std::tuple< Col, Row, Row, BF8, BF8, F32, BF16, I128, I128, I32, NonPersistent, CompV3> +using KernelTypesStreamKBf8NonPersistentAtomicCompV3 = ::testing::Types< + std::tuple< Row, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Atomic> >; -using KernelTypesStreamKFp8NonPersistentCompV3 = ::testing::Types< - std::tuple< Row, Row, Row, F8, F8, F32, F16, I128, I128, I32, NonPersistent, CompV3>, - std::tuple< Row, Col, Row, F8, F8, F32, F16, I128, I128, I32, NonPersistent, CompV3>, - std::tuple< Col, Col, Row, F8, F8, F32, F16, I128, I128, I32, NonPersistent, CompV3>, - std::tuple< Col, Row, Row, F8, F8, F32, F16, I128, I128, I32, NonPersistent, CompV3> +using KernelTypesStreamKFp8NonPersistentAtomicCompV3 = ::testing::Types< + std::tuple< Row, Row, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Atomic>, + std::tuple< Row, Col, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Atomic>, + std::tuple< Col, Col, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Atomic>, + std::tuple< Col, Row, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Atomic> >; -// ========================== CompV4 Pipeline ========================== +// Linear +using 
KernelTypesStreamKFp16PersistentLinearCompV3 = ::testing::Types< +// ALayout BLayout CLayout ADataType BDataType AccDataType CDataType M_MacroTile N_MacroTile K_MacroTile M_WaveTile N_WaveTile K_WaveTile Persistent Pipeline ReductionStrategy -using KernelTypesStreamKFp16PersistentCompV4 = ::testing::Types< -// ALayout BLayout CLayout ADataType BDataType AccDataType CDataType M_MacroTile N_MacroTile K_MacroTile Persistent Pipeline - - std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, Persistent, CompV4>, - std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, Persistent, CompV4>, - std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, Persistent, CompV4>, - std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I32, Persistent, CompV4> + std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Linear>, + std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, I16, I16, I16, Persistent, CompV3, Linear>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Linear>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Linear>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Linear> >; -using KernelTypesStreamKBf16PersistentCompV4 = ::testing::Types< - std::tuple< Row, Row, Row, BF16, BF16, F32, BF16, I256, I256, I32, Persistent, CompV4>, - std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, Persistent, CompV4>, - std::tuple< Col, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, Persistent, CompV4>, - std::tuple< Col, Row, Row, BF16, BF16, F32, BF16, I256, I256, I32, Persistent, CompV4> +using KernelTypesStreamKBf16PersistentLinearCompV3 = ::testing::Types< + std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Linear> >; -using KernelTypesStreamKBf8PersistentCompV4 = 
::testing::Types< - std::tuple< Row, Row, Row, BF8, BF8, F32, BF16, I128, I128, I32, Persistent, CompV4>, - std::tuple< Row, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, Persistent, CompV4>, - std::tuple< Col, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, Persistent, CompV4>, - std::tuple< Col, Row, Row, BF8, BF8, F32, BF16, I128, I128, I32, Persistent, CompV4> +using KernelTypesStreamKBf8PersistentLinearCompV3 = ::testing::Types< + std::tuple< Row, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Linear> >; -using KernelTypesStreamKFp8PersistentCompV4 = ::testing::Types< - std::tuple< Row, Row, Row, F8, F8, F32, F16, I128, I128, I32, Persistent, CompV4>, - std::tuple< Row, Col, Row, F8, F8, F32, F16, I128, I128, I32, Persistent, CompV4>, - std::tuple< Col, Col, Row, F8, F8, F32, F16, I128, I128, I32, Persistent, CompV4>, - std::tuple< Col, Row, Row, F8, F8, F32, F16, I128, I128, I32, Persistent, CompV4> +using KernelTypesStreamKFp8PersistentLinearCompV3 = ::testing::Types< + std::tuple< Row, Row, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Linear>, + std::tuple< Row, Col, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Linear>, + std::tuple< Col, Col, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Linear>, + std::tuple< Col, Row, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Linear> >; -using KernelTypesStreamKFp16NonPersistentCompV4 = ::testing::Types< - std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, NonPersistent, CompV4>, - std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, NonPersistent, CompV4>, - std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, NonPersistent, CompV4>, - std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I32, NonPersistent, CompV4> +using KernelTypesStreamKFp16NonPersistentLinearCompV3 = ::testing::Types< + std::tuple< Row, Row, Row, F16, 
F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Linear>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Linear>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Linear>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Linear> >; -using KernelTypesStreamKBf16NonPersistentCompV4 = ::testing::Types< - std::tuple< Row, Row, Row, BF16, BF16, F32, BF16, I256, I256, I32, NonPersistent, CompV4>, - std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, NonPersistent, CompV4>, - std::tuple< Col, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, NonPersistent, CompV4>, - std::tuple< Col, Row, Row, BF16, BF16, F32, BF16, I256, I256, I32, NonPersistent, CompV4> +using KernelTypesStreamKBf16NonPersistentLinearCompV3 = ::testing::Types< + std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Linear> >; -using KernelTypesStreamKBf8NonPersistentCompV4 = ::testing::Types< - std::tuple< Row, Row, Row, BF8, BF8, F32, BF16, I128, I128, I32, NonPersistent, CompV4>, - std::tuple< Row, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, NonPersistent, CompV4>, - std::tuple< Col, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, NonPersistent, CompV4>, - std::tuple< Col, Row, Row, BF8, BF8, F32, BF16, I128, I128, I32, NonPersistent, CompV4> +using KernelTypesStreamKBf8NonPersistentLinearCompV3 = ::testing::Types< + std::tuple< Row, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Linear> >; -using KernelTypesStreamKFp8NonPersistentCompV4 = ::testing::Types< - std::tuple< Row, Row, Row, F8, F8, F32, F16, I128, I128, I32, NonPersistent, CompV4>, - std::tuple< Row, Col, Row, F8, F8, F32, F16, I128, I128, I32, NonPersistent, CompV4>, - std::tuple< Col, Col, Row, F8, F8, F32, F16, I128, I128, I32, NonPersistent, 
CompV4>, - std::tuple< Col, Row, Row, F8, F8, F32, F16, I128, I128, I32, NonPersistent, CompV4> +using KernelTypesStreamKFp8NonPersistentLinearCompV3 = ::testing::Types< + std::tuple< Row, Row, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Linear>, + std::tuple< Row, Col, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Linear>, + std::tuple< Col, Col, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Linear>, + std::tuple< Col, Row, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Linear> >; -// ============================= Mem Pipeline ============================= +// Tree +using KernelTypesStreamKFp16PersistentTreeCompV3 = ::testing::Types< +// ALayout BLayout CLayout ADataType BDataType AccDataType CDataType M_MacroTile N_MacroTile K_MacroTile M_WaveTile N_WaveTile K_WaveTile Persistent Pipeline ReductionStrategy -using KernelTypesStreamKFp16PersistentMem = ::testing::Types< - std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, Persistent, Mem>, - std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, Persistent, Mem>, - std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, Persistent, Mem>, - std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I32, Persistent, Mem> + std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Tree>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Tree>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, I16, I16, I16, Persistent, CompV3, Tree>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Tree>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Tree> >; -using KernelTypesStreamKBf16PersistentMem = ::testing::Types< - std::tuple< Row, Row, Row, BF16, BF16, F32, 
BF16, I256, I256, I32, Persistent, Mem>, - std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, Persistent, Mem>, - std::tuple< Col, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, Persistent, Mem>, - std::tuple< Col, Row, Row, BF16, BF16, F32, BF16, I256, I256, I32, Persistent, Mem> +using KernelTypesStreamKBf16PersistentTreeCompV3 = ::testing::Types< + std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, I32, I32, I16, Persistent, CompV3, Tree> >; -using KernelTypesStreamKBf8PersistentMem = ::testing::Types< - std::tuple< Row, Row, Row, BF8, BF8, F32, BF16, I128, I128, I32, Persistent, Mem>, - std::tuple< Row, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, Persistent, Mem>, - std::tuple< Col, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, Persistent, Mem>, - std::tuple< Col, Row, Row, BF8, BF8, F32, BF16, I128, I128, I32, Persistent, Mem> +using KernelTypesStreamKBf8PersistentTreeCompV3 = ::testing::Types< + std::tuple< Row, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Tree> >; -using KernelTypesStreamKFp8PersistentMem = ::testing::Types< - std::tuple< Row, Row, Row, F8, F8, F32, F16, I128, I128, I32, Persistent, Mem>, - std::tuple< Row, Col, Row, F8, F8, F32, F16, I128, I128, I32, Persistent, Mem>, - std::tuple< Col, Col, Row, F8, F8, F32, F16, I128, I128, I32, Persistent, Mem>, - std::tuple< Col, Row, Row, F8, F8, F32, F16, I128, I128, I32, Persistent, Mem> +using KernelTypesStreamKFp8PersistentTreeCompV3 = ::testing::Types< + std::tuple< Row, Row, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Tree>, + std::tuple< Row, Col, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Tree>, + std::tuple< Col, Col, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Tree>, + std::tuple< Col, Row, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, Persistent, CompV3, Tree> >; -using KernelTypesStreamKFp16NonPersistentMem = 
::testing::Types< - std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, NonPersistent, Mem>, - std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, NonPersistent, Mem>, - std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, NonPersistent, Mem>, - std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I32, NonPersistent, Mem> +using KernelTypesStreamKFp16NonPersistentTreeCompV3 = ::testing::Types< + std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Tree>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Tree>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Tree>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Tree> >; -using KernelTypesStreamKBf16NonPersistentMem = ::testing::Types< - std::tuple< Row, Row, Row, BF16, BF16, F32, BF16, I256, I256, I32, NonPersistent, Mem>, - std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, NonPersistent, Mem>, - std::tuple< Col, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, NonPersistent, Mem>, - std::tuple< Col, Row, Row, BF16, BF16, F32, BF16, I256, I256, I32, NonPersistent, Mem> +using KernelTypesStreamKBf16NonPersistentTreeCompV3 = ::testing::Types< + std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV3, Tree> >; -using KernelTypesStreamKBf8NonPersistentMem = ::testing::Types< - std::tuple< Row, Row, Row, BF8, BF8, F32, BF16, I128, I128, I32, NonPersistent, Mem>, - std::tuple< Row, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, NonPersistent, Mem>, - std::tuple< Col, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, NonPersistent, Mem>, - std::tuple< Col, Row, Row, BF8, BF8, F32, BF16, I128, I128, I32, NonPersistent, Mem> +using KernelTypesStreamKBf8NonPersistentTreeCompV3 = ::testing::Types< + std::tuple< 
Row, Col, Row, BF8, BF8, F32, BF16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Tree> >; -using KernelTypesStreamKFp8NonPersistentMem = ::testing::Types< - std::tuple< Row, Row, Row, F8, F8, F32, F16, I128, I128, I32, NonPersistent, Mem>, - std::tuple< Row, Col, Row, F8, F8, F32, F16, I128, I128, I32, NonPersistent, Mem>, - std::tuple< Col, Col, Row, F8, F8, F32, F16, I128, I128, I32, NonPersistent, Mem>, - std::tuple< Col, Row, Row, F8, F8, F32, F16, I128, I128, I32, NonPersistent, Mem> +using KernelTypesStreamKFp8NonPersistentTreeCompV3 = ::testing::Types< + std::tuple< Row, Row, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Tree>, + std::tuple< Row, Col, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Tree>, + std::tuple< Col, Col, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Tree>, + std::tuple< Col, Row, Row, F8, F8, F32, F16, I128, I128, I32, I32, I32, I16, NonPersistent, CompV3, Tree> >; +// ============================= Other Pipelines ============================= + +using KernelTypesStreamKPipelines = ::testing::Types< + std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, Mem, Atomic>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, Mem, Tree>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, Mem, Linear>, + std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV4, Atomic>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, NonPersistent, CompV4, Tree>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Persistent, CompV4, Linear> +>; // clang-format on diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp index 8ae1f27e5c..0d2cfe207a 100644 --- 
a/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp +++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp @@ -71,23 +71,27 @@ template class TestCkTileStreamK : public ::testing::Test { protected: - using ALayout = std::tuple_element_t<0, Tuple>; - using BLayout = std::tuple_element_t<1, Tuple>; - using CLayout = std::tuple_element_t<2, Tuple>; - using ADataType = std::tuple_element_t<3, Tuple>; - using BDataType = std::tuple_element_t<4, Tuple>; - using AccDataType = std::tuple_element_t<5, Tuple>; - using CDataType = std::tuple_element_t<6, Tuple>; - using DsLayout = ck_tile::tuple<>; - using DsDataType = ck_tile::tuple<>; - static constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, Tuple>::value; - static constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, Tuple>::value; - static constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, Tuple>::value; - static constexpr bool Persistent = std::tuple_element_t<10, Tuple>::value; - static constexpr auto PipelineType = std::tuple_element_t<11, Tuple>::value; + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using CLayout = std::tuple_element_t<2, Tuple>; + using ADataType = std::tuple_element_t<3, Tuple>; + using BDataType = std::tuple_element_t<4, Tuple>; + using AccDataType = std::tuple_element_t<5, Tuple>; + using CDataType = std::tuple_element_t<6, Tuple>; + using DsLayout = ck_tile::tuple<>; + using DsDataType = ck_tile::tuple<>; + static constexpr ck_tile::index_t M_Tile = std::tuple_element_t<7, Tuple>::value; + static constexpr ck_tile::index_t N_Tile = std::tuple_element_t<8, Tuple>::value; + static constexpr ck_tile::index_t K_Tile = std::tuple_element_t<9, Tuple>::value; + static constexpr ck_tile::index_t M_Warp_Tile = std::tuple_element_t<10, Tuple>::value; + static constexpr ck_tile::index_t N_Warp_Tile = std::tuple_element_t<11, Tuple>::value; + static constexpr ck_tile::index_t K_Warp_Tile = std::tuple_element_t<12, 
Tuple>::value; - template ::value; + static constexpr auto PipelineType = std::tuple_element_t<14, Tuple>::value; + static constexpr auto ReductionStrategy = std::tuple_element_t<15, Tuple>::value; + + template ( - args, ck_tile::stream_config{nullptr, false, 0, 0, 1}); + invoke_streamk<>(args, ck_tile::stream_config{nullptr, false, 0, 0, 1}); c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); diff --git a/test/ck_tile/gemm_streamk/test_generate_test_files.py b/test/ck_tile/gemm_streamk/test_generate_test_files.py new file mode 100644 index 0000000000..7b904da319 --- /dev/null +++ b/test/ck_tile/gemm_streamk/test_generate_test_files.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 + +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +import unittest +from unittest.mock import mock_open, patch +from generate_test_files import suffix_to_file_tag, parse_types_header, output_path + +# ------------------------------------------------------------ # +# Unit tests for helper functions in generate_test_files.py +# ------------------------------------------------------------ # + + +class TestSuffixToFileTag(unittest.TestCase): + def test_fp16_token(self): + suffix = "Fp16" + expected_tag = "fp16" + self.assertEqual(suffix_to_file_tag(suffix), expected_tag) + + def test_bf16_token(self): + suffix = "Bf16" + expected_tag = "bf16" + self.assertEqual(suffix_to_file_tag(suffix), expected_tag) + + def test_fp8_token(self): + suffix = "Fp8" + expected_tag = "fp8" + self.assertEqual(suffix_to_file_tag(suffix), expected_tag) + + def test_bf8_token(self): + suffix = "Bf8" + expected_tag = "bf8" + self.assertEqual(suffix_to_file_tag(suffix), expected_tag) + + def test_nonpersistent_token(self): + suffix = "NonPersistent" + expected_tag = "nonpersistent" + self.assertEqual(suffix_to_file_tag(suffix), expected_tag) + + def test_persistent_token(self): + suffix = "Persistent" + expected_tag = "persistent" + 
self.assertEqual(suffix_to_file_tag(suffix), expected_tag) + + def test_atomic_token(self): + suffix = "Atomic" + expected_tag = "atomic" + self.assertEqual(suffix_to_file_tag(suffix), expected_tag) + + def test_linear_token(self): + suffix = "Linear" + expected_tag = "linear" + self.assertEqual(suffix_to_file_tag(suffix), expected_tag) + + def test_tree_token(self): + suffix = "Tree" + expected_tag = "tree" + self.assertEqual(suffix_to_file_tag(suffix), expected_tag) + + def test_compv3_token(self): + suffix = "CompV3" + expected_tag = "compv3" + self.assertEqual(suffix_to_file_tag(suffix), expected_tag) + + def test_pipelines_token(self): + suffix = "Pipelines" + expected_tag = "pipelines" + self.assertEqual(suffix_to_file_tag(suffix), expected_tag) + + def test_unknown_token(self): + suffix = "unknown" + with self.assertRaises(ValueError): + suffix_to_file_tag(suffix) + + def test_multiple_valid_tokens(self): + suffix = "Fp16PersistentAtomicCompV3" + expected_tag = "fp16_persistent_atomic_compv3" + self.assertEqual(suffix_to_file_tag(suffix), expected_tag) + + def test_multiple_tokens_with_unknown(self): + suffix = "Fp16PersistentUnknownCompV3" + with self.assertRaises(ValueError): + suffix_to_file_tag(suffix) + + +class TestParseTypesHeader(unittest.TestCase): + def validate_entries(self, entries, expected_entries): + self.assertEqual(len(entries), len(expected_entries)) + for idx in range(len(entries)): + self.assertDictEqual(entries[idx], expected_entries[idx]) + + def test_empty_entry(self): + """Test that an empty file returns no entries.""" + mock_content = "" + with patch("builtins.open", mock_open(read_data=mock_content)): + entries = parse_types_header("fake_path.hpp", "atomic_smoke") + self.assertEqual(len(entries), 0) + + def test_pipelines_smoke(self): + """Test pipelines_smoke target: matches suffix == 'Pipelines'. 
+ Includes: Pipelines + Excludes: Fp8NonPersistentTreeCompV3 + """ + mock_content = ( + "using KernelTypesStreamKPipelines = ...\n" + "using KernelTypesStreamKFp8NonPersistentTreeCompV3 = ...\n" + ) + with patch("builtins.open", mock_open(read_data=mock_content)): + entries = parse_types_header("fake_path.hpp", "pipelines_smoke") + expected = [ + { + "type_alias": "KernelTypesStreamKPipelines", + "class_name": "TestCkTileStreamKPipelines", + "file_tag": "pipelines", + } + ] + self.validate_entries(entries, expected) + + def test_extended(self): + """Test extended target: matches 'Atomic' in suffix OR suffix == 'Pipelines'. + Includes: Fp16PersistentAtomic, Pipelines + Excludes: Bf16Linear + """ + mock_content = ( + "using KernelTypesStreamKFp16PersistentAtomic = ...\n" + "using KernelTypesStreamKPipelines = ...\n" + "using KernelTypesStreamKBf16Linear = ...\n" + ) + with patch("builtins.open", mock_open(read_data=mock_content)): + entries = parse_types_header("fake_path.hpp", "extended") + expected = [ + { + "type_alias": "KernelTypesStreamKFp16PersistentAtomic", + "class_name": "TestCkTileStreamKFp16PersistentAtomic", + "file_tag": "fp16_persistent_atomic", + }, + { + "type_alias": "KernelTypesStreamKPipelines", + "class_name": "TestCkTileStreamKPipelines", + "file_tag": "pipelines", + }, + ] + self.validate_entries(entries, expected) + + def test_atomic_smoke(self): + """Test atomic_smoke target: matches 'Atomic' in suffix AND suffix != 'Pipelines'. 
+ Includes: Fp16PersistentAtomic + Excludes: Bf16Linear, Pipelines + """ + mock_content = ( + "using KernelTypesStreamKFp16PersistentAtomic = ...\n" + "using KernelTypesStreamKBf16Linear = ...\n" + "using KernelTypesStreamKPipelines = ...\n" + ) + with patch("builtins.open", mock_open(read_data=mock_content)): + entries = parse_types_header("fake_path.hpp", "atomic_smoke") + expected = [ + { + "type_alias": "KernelTypesStreamKFp16PersistentAtomic", + "class_name": "TestCkTileStreamKFp16PersistentAtomic", + "file_tag": "fp16_persistent_atomic", + } + ] + self.validate_entries(entries, expected) + + def test_linear_smoke(self): + """Test linear_smoke target: matches 'Linear' in suffix AND suffix != 'Pipelines'. + Includes: Fp8NonPersistentLinear + Excludes: Bf16PersistentAtomic, Pipelines + """ + mock_content = ( + "using KernelTypesStreamKFp8NonPersistentLinear = ...\n" + "using KernelTypesStreamKBf16PersistentAtomic = ...\n" + "using KernelTypesStreamKPipelines = ...\n" + ) + with patch("builtins.open", mock_open(read_data=mock_content)): + entries = parse_types_header("fake_path.hpp", "linear_smoke") + expected = [ + { + "type_alias": "KernelTypesStreamKFp8NonPersistentLinear", + "class_name": "TestCkTileStreamKFp8NonPersistentLinear", + "file_tag": "fp8_nonpersistent_linear", + } + ] + self.validate_entries(entries, expected) + + def test_tree_smoke(self): + """Test tree_smoke target: matches 'Tree' in suffix AND suffix != 'Pipelines'. 
+ Includes: Bf8PersistentTreeCompV3 + Excludes: Fp16Linear, Pipelines + """ + mock_content = ( + "using KernelTypesStreamKBf8PersistentTreeCompV3 = ...\n" + "using KernelTypesStreamKFp16Linear = ...\n" + "using KernelTypesStreamKPipelines = ...\n" + ) + with patch("builtins.open", mock_open(read_data=mock_content)): + entries = parse_types_header("fake_path.hpp", "tree_smoke") + expected = [ + { + "type_alias": "KernelTypesStreamKBf8PersistentTreeCompV3", + "class_name": "TestCkTileStreamKBf8PersistentTreeCompV3", + "file_tag": "bf8_persistent_tree_compv3", + } + ] + self.validate_entries(entries, expected) + + +class TestOutputPath(unittest.TestCase): + def test_output_path(self): + """Test that output_path generates the correct file path.""" + entry = {"file_tag": "fp16_persistent_atomic"} + output_dir = "/some/output/dir" + expected = "/some/output/dir/test_gemm_streamk_fp16_persistent_atomic.cpp" + self.assertEqual(output_path(output_dir, entry), expected) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/ck_tile/gemm_streamk_tile_engine/CMakeLists.txt b/test/ck_tile/gemm_streamk_tile_engine/CMakeLists.txt deleted file mode 100644 index 4acab26c41..0000000000 --- a/test/ck_tile/gemm_streamk_tile_engine/CMakeLists.txt +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -# SPDX-License-Identifier: MIT - -include(generate_configs.cmake) - -# ============================================================================ -# GEMM Tile Engine Unit Tests -# -# This CMake file creates unit tests for tile_engine generated GEMM kernels. -# It follows the exact same build patterns as tile_engine for consistency -# and reliability. Each kernel configuration gets its own test executable. 
-# ============================================================================ - -# Locate tile_engine GEMM scripts directory -set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm_streamk") - -if(NOT EXISTS ${TILE_ENGINE_GEMM_DIR}) - message(WARNING "Tile engine directory not found: ${TILE_ENGINE_GEMM_DIR}") - return() -endif() - -# ============================================================================ -# create_individual_gemm_test_target -# -# Creates a single test executable for a specific kernel configuration. -# Mirrors tile_engine's create_individual_gemm_target function for consistency. -# -# Parameters: -# datatype - Data type (fp16, bf16, fp32, etc.) -# layout - Matrix layout (rcr, rrr, ccr, crr) -# config_name - Configuration file name without .json extension -# trait - Kernel trait combination string -# tile_config - Tile configuration parameters -# config_json - Full path to JSON configuration file -# ============================================================================ -function(create_individual_gemm_test_target datatype layout config_name trait tile_config config_json) - set(target_name "test_gemm_streamk_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}") - set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}/${config_name}") - - # Generated header path (already created during cmake configuration) - set(test_header "${working_path}/gemm_streamk_single_${datatype}_${layout}_${trait}_${tile_config}.hpp") - set(test_params_header "${working_path}/test_params.hpp") - - # Verify header exists (should have been generated during cmake configuration) - if(NOT EXISTS ${test_header}) - message(WARNING "Generated header not found: ${test_header}") - return() - endif() - - # Verify test parameters header exists - if(NOT EXISTS ${test_params_header}) - message(WARNING "Test parameters header not found: ${test_params_header}") - return() - endif() - - - # Create GTest executable for this kernel 
configuration - add_gtest_executable(${target_name} - ${CMAKE_CURRENT_SOURCE_DIR}/test_gemm_streamk_simple.cpp - ) - - # Configure GPU architectures for HIP compilation - set_property(TARGET ${target_name} PROPERTY HIP_ARCHITECTURES ${GEMM_TEST_GPU_TARGETS}) - - # Define preprocessor macros for generated header location and test parameters - target_compile_definitions(${target_name} PRIVATE - GEMM_SINGLE_INSTANCE_HPP="${test_header}" - GEMM_TEST_PARAMS_HPP="${test_params_header}" - ) - - # Include directories for headers and dependencies - target_include_directories(${target_name} PRIVATE - ${PROJECT_SOURCE_DIR}/include - ${PROJECT_BINARY_DIR}/include - ${PROJECT_SOURCE_DIR} # Root directory for tile_engine access - ${GTEST_INCLUDE_DIRS} - ) - - # Compiler options matching tile_engine requirements - target_compile_options(${target_name} PRIVATE - -Wno-undefined-func-template # Suppress template warnings - -Wno-float-equal # Allow floating point comparisons - --offload-compress # Enable GPU code compression - -include ${test_header} # Auto-include generated header - ) - - # Add FP8 format definitions for proper data type interpretation - if(CK_USE_OCP_FP8) - target_compile_options(${target_name} PRIVATE -DCK_TILE_USE_OCP_FP8) - endif() - - message(DEBUG " Created test target: ${target_name}") -endfunction() - -# ============================================================================ -# build_gemm_test_targets -# -# Builds all test targets for a specific datatype/layout/config combination. -# Uses tile_engine's two-step process: list kernels, then generate tests. -# -# Parameters: -# datatype - Data type (fp16, bf16, fp32, etc.) 
-# layout - Matrix layout (rcr, rrr, ccr, crr) -# config_name - Configuration file name without .json extension -# ============================================================================ -function(build_gemm_test_targets datatype layout config_name configs_dir_path) - set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}/${config_name}") - - # Locate and validate configuration file - set(config_filename "${config_name}.json") - set(json_blob "${configs_dir_path}/${config_filename}") - - if(NOT EXISTS ${json_blob}) - message(WARNING "Test config file not found: ${json_blob}") - return() - endif() - - # Prepare build directory for this configuration - file(MAKE_DIRECTORY ${working_path}) - - # STEP 1: Discovery phase - list all valid kernel configurations - execute_process( - COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_streamk_instance_builder.py - --working_path ${working_path} - --datatype ${datatype} - --layout ${layout} - --config_json ${json_blob} - --list_kernels - --gpu_targets "${SUPPORTED_GPU_TARGETS}" - WORKING_DIRECTORY ${TILE_ENGINE_GEMM_DIR} - RESULT_VARIABLE ret - OUTPUT_VARIABLE list_output - ERROR_VARIABLE list_error - ) - - if(NOT ret EQUAL 0) - message(WARNING "Failed to list kernels for ${datatype}_${layout}_${config_name}: ${list_error}") - return() - endif() - - # Verify kernel list file was generated - if(NOT EXISTS ${working_path}/gemm_kernel_list.txt) - message(DEBUG "No kernels found for ${datatype}_${layout}_${config_name} (validation filtered out all combinations)") - return() - endif() - - message(DEBUG "Building tests for ${datatype}_${layout}_${config_name}") - - # STEP 2a: Extract test parameters from config - set(test_params_file "${working_path}/test_params.hpp") - execute_process( - COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_SOURCE_DIR}/extract_test_params.py - --config_file ${json_blob} - --output_file ${test_params_file} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - RESULT_VARIABLE 
extract_ret - OUTPUT_VARIABLE extract_output - ERROR_VARIABLE extract_error - ) - - if(NOT extract_ret EQUAL 0) - message(WARNING "Failed to extract test parameters for ${datatype}_${layout}: ${extract_error}") - return() - endif() - - # STEP 2b: Header generation phase - generate headers using --gen_single - message(STATUS " Generating headers using --gen_single...") - - file(STRINGS ${working_path}/gemm_kernel_list.txt kernel_lines) - set(gen_count 0) - - foreach(line IN LISTS kernel_lines) - # Parse kernel specification format: kernel_name|tile_config|trait_combo - string(REPLACE "|" ";" parts "${line}") - list(LENGTH parts parts_len) - if(parts_len EQUAL 3) - list(GET parts 0 kernel_name) - list(GET parts 1 tile_config) - list(GET parts 2 trait_combo) - - # Generate header using --gen_single - execute_process( - COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_streamk_instance_builder.py - --working_path ${working_path} - --datatype ${datatype} - --layout ${layout} - --config_json ${json_blob} - --gen_single - --kernel_name "${kernel_name}" - --tile_config "${tile_config}" - --trait_combo "${trait_combo}" - --gpu_targets "${SUPPORTED_GPU_TARGETS}" - WORKING_DIRECTORY ${TILE_ENGINE_GEMM_DIR} - RESULT_VARIABLE gen_ret - OUTPUT_VARIABLE gen_output - ERROR_VARIABLE gen_error - ) - - if(NOT gen_ret EQUAL 0) - message(WARNING "Failed to generate header for ${kernel_name}: ${gen_error}") - else() - math(EXPR gen_count "${gen_count} + 1") - endif() - endif() - endforeach() - - message(STATUS " Generated ${gen_count} headers for ${datatype}_${layout}") - - # STEP 3: Target creation phase - create test targets - message(STATUS " Creating test targets...") - file(STRINGS ${working_path}/gemm_kernel_list.txt kernel_lines) - set(test_count 0) - foreach(line IN LISTS kernel_lines) - # Parse kernel specification format: kernel_name|tile_config|trait_combo - string(REPLACE "|" ";" parts "${line}") - list(LENGTH parts parts_len) - if(parts_len EQUAL 3) - list(GET 
parts 0 kernel_name) - list(GET parts 1 tile_config) - list(GET parts 2 trait_combo) - - # Generate test target for this kernel configuration - create_individual_gemm_test_target("${datatype}" "${layout}" "${config_name}" "${trait_combo}" "${tile_config}" "${json_blob}") - math(EXPR test_count "${test_count} + 1") - endif() - endforeach() - message(STATUS " Created ${test_count} test targets for ${datatype}_${layout}") -endfunction()# ============================================================================ -# MAIN EXECUTION - Test Target Generation -# ============================================================================ - -message(STATUS "=== Starting StreamK GEMM Tile Engine Test Configuration ===") -message(STATUS "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") - -# GPU architecture filtering - only build tests for supported architectures -set(GEMM_TEST_GPU_TARGETS "") -set(DESIRED_TARGETS "gfx90a;gfx942;gfx950") - -foreach(target IN LISTS SUPPORTED_GPU_TARGETS) - if(target IN_LIST DESIRED_TARGETS) - list(APPEND GEMM_TEST_GPU_TARGETS ${target}) - message(STATUS " Adding GPU target for tests: ${target}") - endif() -endforeach() - -# Early exit if no compatible GPU architectures are available -if(NOT GEMM_TEST_GPU_TARGETS) - message(WARNING "Skipping StreamK GEMM Tile Engine tests: No supported GPU targets (gfx90a, gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") - return() -endif() - -message(STATUS "Building StreamK GEMM tile engine tests for GPU targets: ${GEMM_TEST_GPU_TARGETS}") - - # Enable parallel compilation optimizations - # Set up job pools for better parallel compilation control - set_property(GLOBAL PROPERTY JOB_POOLS - compile_heavy=4 # Limit heavy compilations to prevent OOM - compile_normal=16 # Allow more parallel normal compilations - ) - - # Enable compiler cache if available and explicitly requested - # Disabled by default due to permission issues in CI environments - option(ENABLE_CCACHE_TESTS "Enable 
ccache for test compilation" OFF) - if(ENABLE_CCACHE_TESTS) - find_program(CCACHE_PROGRAM ccache) - if(CCACHE_PROGRAM) - set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) - message(STATUS "Using ccache for faster test compilation") - else() - message(WARNING "ccache requested but not found") - endif() - else() - message(STATUS "ccache disabled for tests (use -DENABLE_CCACHE_TESTS=ON to enable)") - endif() - -# ============================================================================ -# Test Configuration Matrix - Clean Focused Design -# ============================================================================ - -# All supported data types and layouts for comprehensive testing -# Note: fp64 not included (no MFMA hardware support) -set(TEST_DATATYPES "fp16;bf16") -# Temporarily only test rcr and crr -# set(TEST_LAYOUTS "rcr;rrr;ccr;crr") -set(TEST_LAYOUTS "rcr;crr") - -# ============================================================================ -# Test Target Generation - Datatype-Specific Categories -# ============================================================================ - -# 1. 
SMOKE TESTS: Test for basic functionality with data types (fp8, bf8, fp16, bf16) -# Temporarily only consider fp16 -# set(SMALL_DATATYPES "fp16;bf16;fp8;bf8") -set(SMALL_DATATYPES "fp16") -set(SIXTEEN_BIT_DATATYPES "fp16;bf16") -set(EIGHT_BIT_DATATYPES "fp8;bf8") -set(LARGE_TILES "256,256,32") -set(SMALL_TILES "128,128,32") -set(CONFIG_LIST "") -set(GENERATED_CONFIG_PATH ${CMAKE_CURRENT_BINARY_DIR}/configs) -get_cu_count(CU_COUNT) - -message(STATUS "Generating and processing configs for Stream-K tests") -foreach(datatype IN LISTS SMALL_DATATYPES) - - if(datatype IN_LIST SIXTEEN_BIT_DATATYPES) - generate_test_configs(${CU_COUNT} ${LARGE_TILES} ${datatype} CONFIG_LIST ${GENERATED_CONFIG_PATH}) - else() - generate_test_configs(${CU_COUNT} ${SMALL_TILES} ${datatype} CONFIG_LIST ${GENERATED_CONFIG_PATH}) - endif() - - foreach(config IN LISTS CONFIG_LIST) - # testing all layouts (rcr, rrr, ccr, crr) - foreach(layout IN LISTS TEST_LAYOUTS) - build_gemm_test_targets("${datatype}" "${layout}" "${config}" "${GENERATED_CONFIG_PATH}") - endforeach() - endforeach() -endforeach() - -# ============================================================================ - - -message(STATUS "StreamK GEMM tile engine tests configured with datatype-specific design:") -message(STATUS " - Smoke tests: fp16/bf16/fp8/bf8 (all layouts)") diff --git a/test/ck_tile/gemm_streamk_tile_engine/README.md b/test/ck_tile/gemm_streamk_tile_engine/README.md deleted file mode 100644 index 965342536b..0000000000 --- a/test/ck_tile/gemm_streamk_tile_engine/README.md +++ /dev/null @@ -1,64 +0,0 @@ -# Stream-K GEMM Tile Engine Unit Tests - -## How It Works - -This unit test system integrates **tile_engine's kernel generation** into automated testing: - -1. **Uses tile_engine scripts directly**: Same Python scripts that generate tile_engine kernels -2. **JSON-based configuration**: Define test parameters in JSON files (like tile_engine) -3. 
**Build-time generation**: CMake calls tile_engine scripts to generate kernel headers -4. **Individual test executables**: Each kernel configuration becomes a separate test -5. **Tile_engine verification**: Uses exact same error thresholds and validation as tile_engine - -## Tile Engine Integration - -``` -JSON Config → tile_engine Python scripts → Generated Headers → Test Executables -``` - -- **`--list_kernels`**: Get available kernel configurations from JSON -- **`--gen_individual`**: Generate all kernel headers in parallel during CMake configuration -- **`--gen_single`**: Generate individual kernel header for each configuration -- **Same verification**: Uses tile_engine's adaptive error thresholds and reference calculations -- **Same patterns**: Follows tile_engine's tensor initialization, stride calculation, and kernel launching - -### Config-Specific Test Parameters - -Each test configuration can specify optimized problem sizes in its JSON file: -- **`test_params.problem_sizes`**: Array of `{m, n, k, split_k}` configurations -- **CMake extraction**: `extract_test_params.py` generates config-specific test parameter files -- **Build integration**: Each test target uses parameters appropriate for its kernel configuration -- **Optimized testing**: Different configs test different problem sizes that showcase their strengths - - -The key idea: **Unit tests that use tile_engine's exact kernel generation and verification methodology** instead of creating separate test infrastructure. - -## Test Configurations -Test configs are generated during the Generation Phase. They are stored under the build directory at test/ck_tile/gemm_streamk_tile_engine/configs. The Compute Unit (CU) count of the device is required to generate the configs. 
If the Generation Phase occurs on a machine without a GPU or does not contain same GPU architecture on which you will run the tests, you can manually set the CU count using the `CU_COUNT` option: -```bash -# Assuming you are at the root of the repo -cd build -../script/cmake-ck-dev.sh .. gfx90a -G Ninja -DCU_COUNT=100 -``` -You can reference the public whitepaper for your specific GPU to get the appropriate CU count. -If no `CU_COUNT` option is given and no HIP device is found, then the default value of 100 CUs will be used to determine the problem sizes tested. - -### 1. **Smoke Tests** -- **Purpose**: Basic functionality validation for fp16/bf16/fp8/bf8 data types -- **Config**: 256x256x32 (for bf16/fp16) or 128x128x32 (for bf8/fp8), warp 2x2x1, warp_tile 32x32x16 -- **Traits**: compv3 pipeline only -- **Coverage**: All 4 layouts (rcr, rrr, ccr, crr) - -## Data Type Support -- ✅ **fp16, bf16, fp8, bf8**: Fully supported - all layouts (rcr, rrr, ccr, crr) -- ❌ **fp64**: Not supported (hardware MFMA limitation) -- ⏳ **fp32, pk-int4-t**: Not yet supported by gemm_instance_builder (will be added later) - -## Test Result Behavior - -Tests automatically handle unsupported configurations through runtime validation: -- **PASSED**: Kernel executed correctly with results within error thresholds ✅ -- **SKIPPED**: Kernel validation returned "Arguments not supported" (expected for certain problem sizes/configurations) ⚠️ -- **FAILED**: Actual error or incorrect computation results ❌ - -When a kernel's `IsSupportedArgument()` check fails (e.g., due to vector alignment requirements, dimension constraints, or padding limitations), the test is automatically skipped rather than failed. This allows comprehensive testing across various problem sizes while gracefully handling configurations that don't meet specific kernel requirements. 
diff --git a/test/ck_tile/gemm_streamk_tile_engine/cu_count.cpp b/test/ck_tile/gemm_streamk_tile_engine/cu_count.cpp deleted file mode 100644 index 6e2857e8a1..0000000000 --- a/test/ck_tile/gemm_streamk_tile_engine/cu_count.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include -#include - -/** - * @brief Determines whether a `hipError` is present in the given `error_status` - * @return true if the `error_status` has an error, otherwise false. - */ -bool has_error(const hipError_t& error_status) -{ - if(error_status != hipSuccess) - { - std::cerr << hipGetErrorString(error_status); - return true; - } - - return false; -} - -/** - * @brief Returns the number of Compute Units (CUs) on the given device. - * @return The number of CUs on the device. If an error occurs while querying the device, zero is - * returned. - */ -int get_cu_count() -{ - hipDevice_t dev; - hipDeviceProp_t dev_prop; - - const hipError_t device_status = hipGetDevice(&dev); - - if(has_error(device_status)) - return 0; - - const hipError_t prop_status = hipGetDeviceProperties(&dev_prop, dev); - if(has_error(prop_status)) - return 0; - - return dev_prop.multiProcessorCount; -} - -int main() -{ - - std::cout << get_cu_count(); - - return 0; -} diff --git a/test/ck_tile/gemm_streamk_tile_engine/extract_test_params.py b/test/ck_tile/gemm_streamk_tile_engine/extract_test_params.py deleted file mode 100644 index 48ec8dba83..0000000000 --- a/test/ck_tile/gemm_streamk_tile_engine/extract_test_params.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-# SPDX-License-Identifier: MIT - - -import json -import argparse -import os -from pathlib import Path - - -def extract_test_params(config_file, output_file): - """Extract test parameters from config JSON and write to output file""" - - # Read config file - with open(config_file, "r") as f: - config = json.load(f) - - # Extract test parameters - test_params = [] - if "test_params" in config and "problem_sizes" in config["test_params"]: - test_params = config["test_params"]["problem_sizes"] - else: - # Default test parameters if none specified - test_params = [ - {"m": 256, "n": 256, "k": 128, "split_k": 1}, - {"m": 256, "n": 256, "k": 1024, "split_k": 1}, - {"m": 256, "n": 512, "k": 512, "split_k": 1}, - {"m": 512, "n": 256, "k": 512, "split_k": 1}, - ] - - # Write to output file in C++ format - output_dir = Path(output_file).parent - output_dir.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w") as f: - f.write("// Generated test parameters for this configuration\n") - f.write("// This file is auto-generated during CMake configuration\n\n") - f.write("static const std::vector CONFIG_TEST_PARAMS = {\n") - - for i, params in enumerate(test_params): - comma = "," if i < len(test_params) - 1 else "" - f.write( - f" {{{params['m']}, {params['n']}, {params['k']}, {params['split_k']}}}{comma}\n" - ) - - f.write("};\n") - - print( - f"Extracted {len(test_params)} test parameters from {config_file} -> {output_file}" - ) - - -def main(): - parser = argparse.ArgumentParser( - description="Extract test parameters from config JSON" - ) - parser.add_argument("--config_file", required=True, help="Input config JSON file") - parser.add_argument( - "--output_file", required=True, help="Output test parameters file" - ) - - args = parser.parse_args() - - if not os.path.exists(args.config_file): - print(f"Error: Config file not found: {args.config_file}") - return 1 - - extract_test_params(args.config_file, args.output_file) - return 0 - - -if __name__ == "__main__": - 
exit(main()) diff --git a/test/ck_tile/gemm_streamk_tile_engine/generate_configs.cmake b/test/ck_tile/gemm_streamk_tile_engine/generate_configs.cmake deleted file mode 100644 index 148d57976a..0000000000 --- a/test/ck_tile/gemm_streamk_tile_engine/generate_configs.cmake +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -# SPDX-License-Identifier: MIT - -set(CU_COUNT 0 CACHE STRING "Number of Compute Units on the device") - -# ============================================================================ -# get_cu_count -# -# Returns the CU count for the device. If the given cu_count_arg is a positive -# integer, then the nothing happens. Otherwise, we attempt to query the CU -# count from the device. If the query is unsucessful, the default value of 100 -# is returned. -# -# Parameters: -# cu_count_arg - The starting CU count -# ============================================================================ -function(get_cu_count cu_count_arg) - message(STATUS "Starting query for CU count needed for Stream-K test config generation") - - if(NOT "${${cu_count_arg}}" MATCHES "^[0-9]+$") - message(FATAL_ERROR "The CU count must be a non-negative integer. 
\ - The given value of ${${cu_count_arg}} is invalid.") - endif() - - if("${${cu_count_arg}}" STREQUAL "0") - - set(CPP_FILE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cu_count.cpp) - set(CPP_EXE_PATH ${CMAKE_CURRENT_BINARY_DIR}/cu_count) - - execute_process( - COMMAND ${CMAKE_HIP_COMPILER} -x hip ${CPP_FILE_PATH} -o ${CPP_EXE_PATH} - RESULT_VARIABLE compile_exit_code - ) - - if (NOT compile_exit_code EQUAL 0) - message(FATAL_ERROR "Compilation of ${CPP_FILE_PATH} failed.\n") - endif() - - # Get the HIP library directory - get_filename_component(HIP_COMPILER_DIR ${CMAKE_HIP_COMPILER} DIRECTORY) - get_filename_component(HIP_ROOT_DIR ${HIP_COMPILER_DIR} DIRECTORY) - set(HIP_LIB_DIR "${HIP_ROOT_DIR}/lib") - - # Set library path for runtime execution - if(WIN32) - set(ENV{PATH} "${HIP_LIB_DIR};$ENV{PATH}") - else() - set(ENV{LD_LIBRARY_PATH} "${HIP_LIB_DIR}:$ENV{LD_LIBRARY_PATH}") - endif() - - execute_process( - COMMAND ${CPP_EXE_PATH} - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_VARIABLE standard_error - OUTPUT_VARIABLE queried_cu_count - RESULT_VARIABLE queried_cu_count_exit_code - ) - - if (standard_error) - message(STATUS "Error information from attempting to query HIP device and properties:\n" - "${standard_error}") - endif() - - if (NOT queried_cu_count_exit_code EQUAL 0) - message(STATUS "Failed to run ${CPP_EXE_PATH} to query the device's CU count") - - endif() - - - # Delete the generated cu_count executable - file(REMOVE "${CPP_EXE_PATH}") - - if((queried_cu_count STREQUAL "0") OR (NOT queried_cu_count_exit_code EQUAL 0)) - message(WARNING "Unable to query the number of Compute Units. 
\ - Please use the CU_COUNT CLI option to pass in the \ - number of Compute Units for your target device; otherwise, \ - the default value of 100 will be used.") - set(${cu_count_arg} 100 PARENT_SCOPE) - else() - set(${cu_count_arg} ${queried_cu_count} PARENT_SCOPE) - endif() - - endif() - -endfunction() - -# ============================================================================ -# generate_test_configs -# -# Generate config json files for Stream-K tests -# -# Parameters: -# cu_count_arg - The number of CUs on the device -# tile_sizes - A list of block tile sizes: tile_m,tile_n,tile_k -# datatype - The datatype for which the config is being generated -# config_list - The variable to which the list of config file names are written -# configs_path - Path to the configs directory to which config files are written -# ============================================================================ -function(generate_test_configs cu_count_arg tile_sizes datatype config_list configs_path) - message(STATUS "Generating Stream-K test config files for ${datatype}") - - file(MAKE_DIRECTORY ${configs_path}) - - execute_process( - COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_SOURCE_DIR}/generate_configs.py - --cu_count ${cu_count_arg} - --configs_dir_path ${configs_path} - --tiles ${tile_sizes} - --datatype ${datatype} - OUTPUT_VARIABLE CONFIG_LIST - OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE script_ret_val - ) - - if (NOT script_ret_val EQUAL 0) - message(FATAL_ERROR "Eror occured during execution of ${CMAKE_CURRENT_SOURCE_DIR}/generate_configs.py") - endif() - - set(${config_list} ${CONFIG_LIST} PARENT_SCOPE) - -endfunction() diff --git a/test/ck_tile/gemm_streamk_tile_engine/generate_configs.py b/test/ck_tile/gemm_streamk_tile_engine/generate_configs.py deleted file mode 100644 index 2795303684..0000000000 --- a/test/ck_tile/gemm_streamk_tile_engine/generate_configs.py +++ /dev/null @@ -1,287 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Advanced Micro 
Devices, Inc., or its affiliates. -# SPDX-License-Identifier: MIT - -from enum import Enum -from typing import Dict, Tuple, List -import argparse -import json -import os -import sys -from dataclasses import dataclass, field, asdict - - -@dataclass -class TileConfig: - """Represents the Tile Config section of a Tile Engine config""" - - tile_m: List[int] = field(default_factory=list) - tile_n: List[int] = field(default_factory=list) - tile_k: List[int] = field(default_factory=list) - warp_m: List[int] = field(default_factory=lambda: [2]) - warp_n: List[int] = field(default_factory=lambda: [2]) - warp_k: List[int] = field(default_factory=lambda: [1]) - warp_tile_m: List[int] = field(default_factory=lambda: [16, 32]) - warp_tile_n: List[int] = field(default_factory=lambda: [16, 32]) - # Temporarily only consider 16 for warp_tile_k - # warp_tile_k: List[int] = field(default_factory=lambda: [8, 16, 32]) - warp_tile_k: List[int] = field(default_factory=lambda: [16]) - - def to_dict(self) -> Dict: - return {k: {"values": v} for k, v in asdict(self).items()} - - -@dataclass -class TraitConfig: - """Represents the Trait Config section of a Tile Engine config""" - - # Temporarily only consider compv3 - # pipeline: List[str] = field(default_factory=lambda: ["compv3", "mem"]) - pipeline: List[str] = field(default_factory=lambda: ["compv3"]) - epilogue: List[str] = field(default_factory=lambda: ["cshuffle"]) - scheduler: List[str] = field(default_factory=lambda: ["intrawave"]) - pad_m: List[bool] = field(default_factory=lambda: [False]) - pad_n: List[bool] = field(default_factory=lambda: [False]) - pad_k: List[bool] = field(default_factory=lambda: [False]) - persistent: List[bool] = field(default_factory=lambda: [True, False]) - reduction_strategy: List[str] = field(default_factory=list) - - def to_dict(self) -> Dict: - return {k: {"values": v} for k, v in asdict(self).items()} - - -class TestVariant(Enum): - """Represents a Stream-K test variant""" - - def __init__( - self, - 
val: int, - reduction_strategy: List[str], - persistent: List[bool], - datatypes: List[str], - description: str, - ): - self._value_ = val - self.reduction_strategy = reduction_strategy - self.persistent = persistent - self.datatypes = datatypes - self.description = description - - ATOMIC_SMOKE = ( - 0, - ["atomic"], - [True, False], - # Temporarily only run fp16 tests - # ["fp16", "bf16", "fp8", "bf8"], - ["fp16"], - "Stream-K atomic smoke tests", - ) - REDUCTION_SMOKE = ( - 2, - ["linear", "tree"], - [True, False], - # Temporarily only run fp16 tests - # ["fp16", "bf16", "fp8", "bf8"], - ["fp16"], - "Stream-K reduction smoke tests", - ) - EXTENDED = ( - 3, - ["atomic"], - [True, False], - # Temporarily only run fp16 tests - # ["fp16", "bf16", "fp8", "bf8"], - ["fp16"], - "Stream-K extended smoke tests", - ) - - def apply(self, trait_config: TraitConfig) -> None: - """Applies the current test variant's persistent and reduction strategy setting to the given trait_config""" - trait_config.persistent = self.persistent - trait_config.reduction_strategy = self.reduction_strategy - - -@dataclass -class ProblemSize: - """Represents a problem size in a Tile Engine config""" - - m: int - n: int - k: int - variant: TestVariant - split_k: int = 1 - - def to_dict(self) -> Dict: - return {"m": self.m, "n": self.n, "k": self.k, "split_k": self.split_k} - - -@dataclass -class Config: - """Represents a Tile Engine config""" - - description: str - problem_sizes: list[ProblemSize] = field(default_factory=list) - tile_config: TileConfig = field(default_factory=TileConfig) - trait_config: TraitConfig = field(default_factory=TraitConfig) - k_block_per_cu: int = 1 - permute_n: bool = False - - def add_problem_size(self, problem: ProblemSize) -> None: - """Adds the given problem to this config's problem_sizes""" - self.problem_sizes.append(problem) - - def to_dict(self) -> Dict: - config_dict = { - "problem": {"description": f"{self.description}"}, - "test_params": { - "problem_sizes": 
[ps.to_dict() for ps in self.problem_sizes] - }, - "tile_config": self.tile_config.to_dict(), - "trait_config": self.trait_config.to_dict(), - "k_block_per_cu": self.k_block_per_cu, - "permute_n": self.permute_n, - } - return config_dict - - def write_to_file(self, output_file: str) -> None: - """Writes this configs to the given output_file in a json format""" - with open(output_file, "w") as config_file: - json.dump(self.to_dict(), config_file, indent=4) - config_file.write("\n") - - -def create_problem_sizes( - tile_m: int, tile_n: int, tile_k: int, cu_count: int -) -> List[ProblemSize]: - """Creates and returns a list of problem sizes using the given arguments""" - problem_sizes = [ - ProblemSize(256, 256, 256, TestVariant.ATOMIC_SMOKE), - ProblemSize(tile_m * cu_count, tile_n, tile_k, TestVariant.ATOMIC_SMOKE), - ProblemSize( - tile_m * 2, tile_n * 2, cu_count * tile_k, TestVariant.ATOMIC_SMOKE - ), - ProblemSize(tile_m, tile_n, cu_count * tile_k, TestVariant.REDUCTION_SMOKE), - ProblemSize( - tile_m * 4, - tile_n, - tile_k * cu_count + (25 * tile_k), - TestVariant.REDUCTION_SMOKE, - ), - ProblemSize( - tile_m * 3, - tile_n * 7, - tile_k * cu_count + (30 * tile_k), - TestVariant.REDUCTION_SMOKE, - ), - # TODO: Add this test once we determine how to label tests as regresion with tile engine - # ProblemSize((tile_m * cu_count * 2) + (tile_m * 2), tile_n, 2048, TestVariant.EXTENDED) - ] - - return problem_sizes - - -def write_config_files( - problem_sizes: List[ProblemSize], - configs_dir_path: str, - datatype: str, - tile_sizes: Tuple[int, int, int], -) -> str: - """Writes the given problem_sizes to a config file and returns the names of the config files written to""" - config_names = [] - tile_m, tile_n, tile_k = tile_sizes - tile_config = TileConfig([tile_m], [tile_n], [tile_k]) - - # Create a config for each test variant - for variant in TestVariant: - problem_sizes_filtered = [ps for ps in problem_sizes if ps.variant == variant] - - if (datatype not in 
variant.datatypes) or len(problem_sizes_filtered) == 0: - continue - - trait_config = TraitConfig() - variant.apply(trait_config) - config_name = f"streamk_{variant.name.lower()}_tests_config_{datatype}" - config_names.append(config_name) - file_path = os.path.join(configs_dir_path, config_name + ".json") - config = Config( - variant.description, problem_sizes_filtered, tile_config, trait_config - ) - config.write_to_file(file_path) - - return config_names - - -def print_config_names(config_file_names: List[str]) -> None: - """Prints given config file names as a single semi-colon separated string""" - print(";".join(config_file_names)) - - -def create_config_files( - cu_count: int, configs_dir_path: str, tile_sizes: int, datatype: str -) -> None: - """Creates Stream-K test config files and prints the file names in a semi-colon-separated list""" - tile_m, tile_n, tile_k = tile_sizes - - problem_sizes = create_problem_sizes(tile_m, tile_n, tile_k, cu_count) - config_names = write_config_files( - problem_sizes, configs_dir_path, datatype, tile_sizes - ) - print_config_names(config_names) - - -def get_args() -> Tuple[int, str, Tuple[int, int, int], str]: - """Returns user provided arguments""" - - def tile_sizes_type(val: str): - sizes = None - parts = val.split(",") - if len(parts) != 3: - raise argparse.ArgumentTypeError( - "--tiles must contain exactly three comma-separated values (m,n,k), e.g. --tiles 256,256,32" - ) - try: - sizes = tuple(int(size) for size in parts) - except ValueError: - raise argparse.ArgumentTypeError( - "--tiles must contain exactly three comma-separated integers (m,n,k), e.g. 
--tiles 256,256,32" - ) - - return sizes - - parser = argparse.ArgumentParser(description="Create Stream-K test configs") - parser.add_argument( - "--cu_count", required=True, help="Number of Compute Units on the device" - ) - parser.add_argument( - "--configs_dir_path", - required=True, - help="Full path configs directory where config files will be written to", - ) - - parser.add_argument( - "--tiles", - required=True, - type=tile_sizes_type, - help="Block tile sizes for m, n, and k, respectively. Ex: --tiles 256,256,32", - ) - - parser.add_argument( - "--datatype", - choices=["fp16", "bf16", "fp8", "bf8"], - required=True, - help="The datatype for which the config is generated.", - ) - - args = parser.parse_args() - - return (int(args.cu_count), args.configs_dir_path, args.tiles, args.datatype) - - -def main(): - cu_count, configs_dir_path, tile_sizes, datatype = get_args() - create_config_files(cu_count, configs_dir_path, tile_sizes, datatype) - sys.exit(0) - - -if __name__ == "__main__": - main() diff --git a/test/ck_tile/gemm_streamk_tile_engine/test_gemm_streamk_simple.cpp b/test/ck_tile/gemm_streamk_tile_engine/test_gemm_streamk_simple.cpp deleted file mode 100644 index 284feb477d..0000000000 --- a/test/ck_tile/gemm_streamk_tile_engine/test_gemm_streamk_simple.cpp +++ /dev/null @@ -1,258 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -/** - * @file test_gemm_simple.cpp - * @brief Unit tests for GEMM kernels generated by gemm_instance_builder - * - * This test includes kernels generated during CMake configuration by - * gemm_instance_builder.py and tests them with problem sizes extracted - * from the corresponding JSON configuration files. 
- */ - -#include -#include -#include - -#include "ck_tile/core.hpp" -#include "ck_tile/host.hpp" -#include "tile_engine/ops/gemm_streamk/gemm_streamk_common.hpp" - -// The kernel header is included via compile command line with -include flag -// It defines SelectedKernel struct, KERNEL_NAME, and tensor data types - -// Adaptive error threshold calculation matching tile_engine's implementation -template -auto calculate_rtol_atol(const ck_tile::index_t K, - const ck_tile::index_t kbatch, - const float max_accumulated_value) -{ - using ComputeType = - std::conditional_t; - // Calculate thresholds - const auto rtol = ck_tile::get_relative_threshold( - ck_tile::integer_divide_ceil(K, kbatch)); - const auto atol = ck_tile::get_absolute_threshold( - max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); - // Calculate error due to split_k accumulation - const auto rtol_split_k = - ck_tile::get_relative_threshold(kbatch); - const auto atol_split_k = ck_tile::get_absolute_threshold( - max_accumulated_value, kbatch); - // Use higher threshold - return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); -} - -/// @brief Function to compare the results of the device and host computations (from tile_engine) -template -bool compare_results(std::string instanceName, - ck_tile::index_t K, - ck_tile::index_t kbatch, - ck_tile::HostTensor& c_m_n_dev_result, - ck_tile::HostTensor& c_m_n_host_result) -{ - const float max_accumulated_value = - std::abs(static_cast(*std::max_element(c_m_n_host_result.mData.begin(), - c_m_n_host_result.mData.end(), - [](CDataType a, CDataType b) { - return std::abs(static_cast(a)) < - std::abs(static_cast(b)); - }))); - const auto rtol_atol = calculate_rtol_atol( - K, kbatch, max_accumulated_value); - bool pass = ck_tile::check_err(c_m_n_dev_result, - c_m_n_host_result, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); - - std::cout << "For " << 
instanceName << " Relative error threshold is " - << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is " - << rtol_atol.at(ck_tile::number<1>{}) << std::endl; - std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; - - return pass; -} - -// Test parameter structure for matrix dimensions and split_k values -struct GemmTestParams -{ - int m, n, k, split_k; -}; - -// Include config-specific test parameters (after GemmTestParams struct is defined) -#ifdef GEMM_TEST_PARAMS_HPP -#include GEMM_TEST_PARAMS_HPP -#endif - -class StreamKGemmTileEngineTest : public ::testing::TestWithParam -{ - protected: - void SetUp() override - { - auto params = GetParam(); - m_ = params.m; - n_ = params.n; - k_ = params.k; - split_k_ = params.split_k; - - // Calculate strides (following tile_engine pattern) - if constexpr(std::is_same_v) - { - stride_a_ = k_; - } - else - { - stride_a_ = m_; - } - - if constexpr(std::is_same_v) - { - stride_b_ = n_; - } - else - { - stride_b_ = k_; - } - - if constexpr(std::is_same_v) - { - stride_c_ = n_; - } - else - { - stride_c_ = m_; - } - } - - // Test dimensions - int m_, n_, k_, split_k_; - int stride_a_, stride_b_, stride_c_; -}; - -TEST_P(StreamKGemmTileEngineTest, BasicFunctionality) -{ - // Check that kernel information is available - EXPECT_TRUE(strlen(KERNEL_NAME) > 0) << "Kernel name should not be empty"; - - std::cout << "Testing kernel: " << KERNEL_NAME << std::endl; - std::cout << "Problem size: " << m_ << "x" << n_ << "x" << k_ << std::endl; - - // Get tensor layouts from generated kernel - const ALayout layout_a = ALayout{}; - const BLayout layout_b = BLayout{}; - const CLayout layout_c = CLayout{}; - - // Calculate tensor strides - int stride_a_calc = ck_tile::get_default_stride(m_, k_, 0, is_row_major(layout_a)); - int stride_b_calc = ck_tile::get_default_stride(k_, n_, 0, is_row_major(layout_b)); - int stride_c_calc = ck_tile::get_default_stride(m_, n_, 0, is_row_major(layout_c)); - 
- // Create host tensors with proper descriptors - ck_tile::HostTensor a_m_k( - ck_tile::host_tensor_descriptor(m_, k_, stride_a_calc, is_row_major(layout_a))); - ck_tile::HostTensor b_k_n( - ck_tile::host_tensor_descriptor(k_, n_, stride_b_calc, is_row_major(layout_b))); - ck_tile::HostTensor c_m_n_dev_result( - ck_tile::host_tensor_descriptor(m_, n_, stride_c_calc, is_row_major(layout_c))); - ck_tile::HostTensor c_m_n_dev_ref( - ck_tile::host_tensor_descriptor(m_, n_, stride_c_calc, is_row_major(layout_c))); - - // Initialize input tensors with uniform random distribution [-1.0, 1.0] (matches tile_engine) - ck_tile::FillUniformDistribution{-1.f, 1.f}(a_m_k); - ck_tile::FillUniformDistribution{-1.f, 1.f}(b_k_n); - c_m_n_dev_ref.SetZero(); - - // Allocate GPU device memory - ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); - ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); - ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); - ck_tile::DeviceMem ref_c_m_n_dev_buf(c_m_n_dev_ref.get_element_space_size_in_bytes()); - - // Copy data to device and zero output buffer - a_m_k_dev_buf.ToDevice(a_m_k.data()); - b_k_n_dev_buf.ToDevice(b_k_n.data()); - c_m_n_dev_buf.SetZero(); - ref_c_m_n_dev_buf.SetZero(); - - // Calculate reference result on device for verification - ADataType* a_m_k_dev_ref_ptr = static_cast(a_m_k_dev_buf.GetDeviceBuffer()); - BDataType* b_k_n_dev_ref_ptr = static_cast(b_k_n_dev_buf.GetDeviceBuffer()); - CDataType* c_m_n_dev_ref_ptr = static_cast(ref_c_m_n_dev_buf.GetDeviceBuffer()); - ck_tile:: - reference_gemm_gpu( - a_m_k_dev_ref_ptr, - b_k_n_dev_ref_ptr, - c_m_n_dev_ref_ptr, - m_, - n_, - k_, - stride_a_calc, - stride_b_calc, - stride_c_calc); - ref_c_m_n_dev_buf.FromDevice(c_m_n_dev_ref.data()); - - // Create GEMM kernel arguments - ck_tile::StreamKHostArgs args{a_m_k_dev_buf.GetDeviceBuffer(), - b_k_n_dev_buf.GetDeviceBuffer(), - c_m_n_dev_buf.GetDeviceBuffer(), - 
m_, - n_, - k_, - stride_a_calc, - stride_b_calc, - stride_c_calc}; - - // Configure kernel execution for maximum speed (no timing, no debug output) - ck_tile::stream_config stream_config{nullptr, // stream - false, // time_kernel (disable timing for speed) - 0, // log_level (disable debug output) - 0, // n_warmup - 1, // n_repeat - false, // is_gpu_timer (unused when time_kernel=false) - false, // flush_cache - 1}; // rotating_count - - // Launch the generated kernel (no timing overhead for fastest execution) - std::tuple launch_result; - try - { - launch_result = SelectedKernel::launch(args, stream_config); - // Kernel launched successfully if no exception thrown - } - catch(const std::exception& e) - { - std::string error_msg(e.what()); - // If arguments not supported, skip the test (configuration validation failure, not a bug) - if(error_msg.find("Arguments not supported") != std::string::npos) - { - GTEST_SKIP() << "Configuration not supported: " << e.what(); - } - else - { - FAIL() << "Kernel launch failed: " << e.what(); - } - } - - // Copy result back from device - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - - // Verify results using tile_engine's adaptive error thresholds - const ck_tile::index_t num_wgs_per_tile = get<1>(launch_result); - bool verification_passed = compare_results( - KERNEL_NAME, k_, num_wgs_per_tile, c_m_n_dev_result, c_m_n_dev_ref); - - EXPECT_TRUE(verification_passed) << "GEMM result verification failed"; -} - -// Use config-specific test parameters (included via compile flags) -// CONFIG_TEST_PARAMS is defined in the auto-generated test_params.hpp file -INSTANTIATE_TEST_SUITE_P(GemmVerification, - StreamKGemmTileEngineTest, - ::testing::ValuesIn(CONFIG_TEST_PARAMS), - [](const ::testing::TestParamInfo& param_info) { - return std::to_string(param_info.param.m) + "x" + - std::to_string(param_info.param.n) + "x" + - std::to_string(param_info.param.k) + "_splitk" + - std::to_string(param_info.param.split_k); - }); diff --git 
a/test/ck_tile/gemm_tile_engine/CMakeLists.txt b/test/ck_tile/gemm_tile_engine/CMakeLists.txt index 33effcc120..4cecba0e8a 100644 --- a/test/ck_tile/gemm_tile_engine/CMakeLists.txt +++ b/test/ck_tile/gemm_tile_engine/CMakeLists.txt @@ -232,7 +232,7 @@ message(STATUS "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") # GPU architecture filtering - only build tests for supported architectures set(GEMM_TEST_GPU_TARGETS "") -set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201") +set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201;gfx12-generic") foreach(target IN LISTS SUPPORTED_GPU_TARGETS) if(target IN_LIST DESIRED_TARGETS) @@ -243,7 +243,7 @@ endforeach() # Early exit if no compatible GPU architectures are available if(NOT GEMM_TEST_GPU_TARGETS) - message(WARNING "Skipping GEMM Tile Engine tests: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") + message(WARNING "Skipping GEMM Tile Engine tests: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201, gfx12-generic) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") return() endif() diff --git a/test/ck_tile/pooling_tile_engine/CMakeLists.txt b/test/ck_tile/pooling_tile_engine/CMakeLists.txt new file mode 100644 index 0000000000..d41539cb9f --- /dev/null +++ b/test/ck_tile/pooling_tile_engine/CMakeLists.txt @@ -0,0 +1,341 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +# ============================================================================ +# Pooling Tile Engine Unit Tests +# +# This CMake file creates unit tests for tile_engine generated pooling kernels. +# Each kernel configuration gets its own test executable. 
+# ============================================================================ + +# Locate tile_engine pooling scripts directory +set(TILE_ENGINE_POOLING_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/pooling") + +if(NOT EXISTS ${TILE_ENGINE_POOLING_DIR}) + message(WARNING "Tile engine pooling directory not found: ${TILE_ENGINE_POOLING_DIR}") + return() +endif() + +# ============================================================================ +# create_individual_pool_test_target +# +# Creates a single test executable for a specific pooling kernel configuration. +# +# Parameters: +# datatype - Data type (fp16, fp32, bf16) +# config_name - Configuration file name without .json extension +# trait - Kernel trait combination string +# tile_config - Tile configuration parameters +# config_json - Full path to JSON configuration file +# ============================================================================ +function(create_individual_pool_test_target datatype config_name kernel_name trait tile_config config_json) + set(target_name "test_pooling_tile_engine_${datatype}_${config_name}_${trait}_${tile_config}") + set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${config_name}") + + # Generated header path (already created during cmake configuration) + # Use kernel_name from pool_kernel_list.txt to match the filename generated by pooling_instance_builder.py + set(test_header "${working_path}/pooling_single_${kernel_name}.hpp") + + # Determine pooling dimension from trait string (format: reduce_op_output_index_propagate_nan_pooling_dim) + # The pooling_dim is the last field: "2d" or "3d" + string(REGEX MATCH "[23]d$" kernel_pooling_dim "${trait}") + if(kernel_pooling_dim STREQUAL "3d") + set(test_params_header "${working_path}/test_params_3d.hpp") + set(pooling_dim_value 3) + else() + set(test_params_header "${working_path}/test_params_2d.hpp") + set(pooling_dim_value 2) + endif() + + # Verify header exists + if(NOT EXISTS ${test_header}) + message(WARNING 
"Generated header not found: ${test_header}") + return() + endif() + + # Verify test parameters header exists + if(NOT EXISTS ${test_params_header}) + message(WARNING "Test parameters header not found: ${test_params_header}") + return() + endif() + + # Create GTest executable for this kernel configuration + add_gtest_executable(${target_name} + ${CMAKE_CURRENT_SOURCE_DIR}/test_pooling_simple.cpp + ) + + # Configure GPU architectures for HIP compilation + set_property(TARGET ${target_name} PROPERTY HIP_ARCHITECTURES ${POOLING_TEST_GPU_TARGETS}) + + # Define preprocessor macros for generated header location, test parameters, and pooling dimension + target_compile_definitions(${target_name} PRIVATE + POOLING_SINGLE_INSTANCE_HPP="${test_header}" + POOLING_TEST_PARAMS_HPP="${test_params_header}" + POOLING_DIM_VALUE=${pooling_dim_value} + ) + + # Include directories for headers and dependencies + target_include_directories(${target_name} PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_BINARY_DIR}/include + ${PROJECT_SOURCE_DIR} # Root directory for tile_engine access + ${GTEST_INCLUDE_DIRS} + ) + + # Compiler options matching tile_engine requirements + target_compile_options(${target_name} PRIVATE + -Wno-undefined-func-template + -Wno-float-equal + --offload-compress + -include ${test_header} + ) + + # Add FP8 format definitions for proper data type interpretation + if(CK_USE_OCP_FP8) + target_compile_options(${target_name} PRIVATE -DCK_TILE_USE_OCP_FP8) + endif() + + message(STATUS " Created test target: ${target_name}") +endfunction() + +# ============================================================================ +# build_pool_test_targets +# +# Builds all test targets for a specific datatype/config combination. +# Uses tile_engine's two-step process: list kernels, then generate tests. 
+# +# Parameters: +# datatype - Data type (fp16, fp32, bf16) +# config_name - Configuration file name without .json extension +# ============================================================================ +function(build_pool_test_targets datatype config_name) + set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${config_name}") + + # Locate and validate configuration file + set(config_filename "${config_name}.json") + set(json_blob "${CMAKE_CURRENT_SOURCE_DIR}/configs/${config_filename}") + + if(NOT EXISTS ${json_blob}) + message(WARNING "Test config file not found: ${json_blob}") + return() + endif() + + # Prepare build directory for this configuration + file(MAKE_DIRECTORY ${working_path}) + + # STEP 1: Discovery phase - list all valid kernel configurations + execute_process( + COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_POOLING_DIR}/pooling_instance_builder.py + --working_path ${working_path} + --datatype ${datatype} + --config_json ${json_blob} + --list_kernels + WORKING_DIRECTORY ${TILE_ENGINE_POOLING_DIR} + RESULT_VARIABLE ret + OUTPUT_VARIABLE list_output + ERROR_VARIABLE list_error + ) + + if(NOT ret EQUAL 0) + message(WARNING "Failed to list pooling kernels for ${datatype}_${config_name}: ${list_error}") + return() + endif() + + # Verify kernel list file was generated + if(NOT EXISTS ${working_path}/pool_kernel_list.txt) + message(STATUS "No pooling kernels found for ${datatype}_${config_name}") + return() + endif() + + message(STATUS "Building pooling tests for ${datatype}_${config_name}") + + # STEP 2a: Extract test parameters from config for BOTH 2D and 3D dimensions. + # Each kernel's pooling_dim is embedded in its trait string, so we generate + # separate test_params headers and select the right one per kernel target. 
+ set(test_params_file_2d "${working_path}/test_params_2d.hpp") + execute_process( + COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_SOURCE_DIR}/extract_test_params.py + --config_file ${json_blob} + --output_file ${test_params_file_2d} + --pooling_dim 2d + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE extract_ret_2d + OUTPUT_VARIABLE extract_output_2d + ERROR_VARIABLE extract_error_2d + ) + if(NOT extract_ret_2d EQUAL 0) + message(WARNING "Failed to extract 2D test parameters for pooling ${datatype}: ${extract_error_2d}") + return() + endif() + + set(test_params_file_3d "${working_path}/test_params_3d.hpp") + execute_process( + COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_SOURCE_DIR}/extract_test_params.py + --config_file ${json_blob} + --output_file ${test_params_file_3d} + --pooling_dim 3d + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE extract_ret_3d + OUTPUT_VARIABLE extract_output_3d + ERROR_VARIABLE extract_error_3d + ) + if(NOT extract_ret_3d EQUAL 0) + message(WARNING "Failed to extract 3D test parameters for pooling ${datatype}: ${extract_error_3d}") + return() + endif() + + # STEP 2c: Header generation phase - generate headers using --gen_single + message(STATUS " Generating pooling headers using --gen_single...") + + file(STRINGS ${working_path}/pool_kernel_list.txt kernel_lines) + set(gen_count 0) + + foreach(line IN LISTS kernel_lines) + # Parse kernel specification format: kernel_name|tile_config|trait_combo + string(REPLACE "|" ";" parts "${line}") + list(LENGTH parts parts_len) + if(parts_len EQUAL 3) + list(GET parts 0 kernel_name) + list(GET parts 1 tile_config) + list(GET parts 2 trait_combo) + + # Generate header using --gen_single + execute_process( + COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_POOLING_DIR}/pooling_instance_builder.py + --working_path ${working_path} + --datatype ${datatype} + --config_json ${json_blob} + --gen_single + --kernel_name "${kernel_name}" + --tile_config "${tile_config}" + 
--trait_combo "${trait_combo}" + WORKING_DIRECTORY ${TILE_ENGINE_POOLING_DIR} + RESULT_VARIABLE gen_ret + OUTPUT_VARIABLE gen_output + ERROR_VARIABLE gen_error + ) + + if(NOT gen_ret EQUAL 0) + message(WARNING "Failed to generate pooling header for ${kernel_name}: ${gen_error}") + else() + math(EXPR gen_count "${gen_count} + 1") + endif() + endif() + endforeach() + + message(STATUS " Generated ${gen_count} pooling headers for ${datatype}") + + # STEP 3: Target creation phase - create test targets + message(STATUS " Creating pooling test targets...") + file(STRINGS ${working_path}/pool_kernel_list.txt kernel_lines) + set(test_count 0) + foreach(line IN LISTS kernel_lines) + string(REPLACE "|" ";" parts "${line}") + list(LENGTH parts parts_len) + if(parts_len EQUAL 3) + list(GET parts 0 kernel_name) + list(GET parts 1 tile_config) + list(GET parts 2 trait_combo) + + create_individual_pool_test_target("${datatype}" "${config_name}" "${kernel_name}" "${trait_combo}" "${tile_config}" "${json_blob}") + math(EXPR test_count "${test_count} + 1") + endif() + endforeach() + message(STATUS " Created ${test_count} pooling test targets for ${datatype}") +endfunction() + +# ============================================================================ +# MAIN EXECUTION - Test Target Generation +# ============================================================================ + +message(STATUS "=== Starting Pooling Tile Engine Test Configuration ===") +message(STATUS "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") + +# GPU architecture filtering - only build tests for supported architectures +set(POOLING_TEST_GPU_TARGETS "") +set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201") + +foreach(target IN LISTS SUPPORTED_GPU_TARGETS) + if(target IN_LIST DESIRED_TARGETS) + list(APPEND POOLING_TEST_GPU_TARGETS ${target}) + message(STATUS " Adding GPU target for pooling tests: ${target}") + endif() +endforeach() + +# Early exit if no compatible GPU architectures are available +if(NOT 
POOLING_TEST_GPU_TARGETS) + message(WARNING "Skipping Pooling Tile Engine tests: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") + return() +endif() + +message(STATUS "Building Pooling tile engine tests for GPU targets: ${POOLING_TEST_GPU_TARGETS}") + +# Enable parallel compilation optimizations +set_property(GLOBAL PROPERTY JOB_POOLS + compile_heavy=4 + compile_normal=16 +) + +# Enable compiler cache if available and explicitly requested +option(ENABLE_CCACHE_TESTS "Enable ccache for test compilation" OFF) +if(ENABLE_CCACHE_TESTS) + find_program(CCACHE_PROGRAM ccache) + if(CCACHE_PROGRAM) + set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) + message(STATUS "Using ccache for faster test compilation") + else() + message(WARNING "ccache requested but not found") + endif() +else() + message(STATUS "ccache disabled for tests (use -DENABLE_CCACHE_TESTS=ON to enable)") +endif() + +# ============================================================================ +# Test Configuration Matrix +# ============================================================================ + +set(TEST_DATATYPES "fp16;fp32") + +# ============================================================================ +# Test Target Generation +# ============================================================================ + +# 1. SIMPLE TEST: Basic functionality validation (always built) +set(SIMPLE_TEST_CONFIG "simple_test_config") +set(SIMPLE_TEST_CONFIG_FILE "${CMAKE_CURRENT_SOURCE_DIR}/configs/${SIMPLE_TEST_CONFIG}.json") + +if(EXISTS ${SIMPLE_TEST_CONFIG_FILE}) + message(STATUS "Processing pooling simple test config: ${SIMPLE_TEST_CONFIG}") + foreach(datatype IN LISTS TEST_DATATYPES) + build_pool_test_targets("${datatype}" "${SIMPLE_TEST_CONFIG}") + endforeach() +else() + message(WARNING "Pooling simple test config file not found: ${SIMPLE_TEST_CONFIG_FILE}") +endif() + +# 2. 
COVERAGE LEVEL: Quick or comprehensive testing +# Quick: ~2 kernels (1 tile config × 1 trait combo × fp16/fp32) from simple config only +# Comprehensive: ~200+ kernels with extensive tile sizes, warp configurations, and all trait combinations +set(POOLING_COVERAGE_LEVEL "quick" CACHE STRING "Pooling coverage level: quick or comprehensive") +set_property(CACHE POOLING_COVERAGE_LEVEL PROPERTY STRINGS "quick" "comprehensive") + +if(POOLING_COVERAGE_LEVEL STREQUAL "comprehensive") + set(COMPREHENSIVE_CONFIG "comprehensive_coverage_config") + set(COMPREHENSIVE_CONFIG_FILE "${CMAKE_CURRENT_SOURCE_DIR}/configs/${COMPREHENSIVE_CONFIG}.json") + + if(EXISTS ${COMPREHENSIVE_CONFIG_FILE}) + message(STATUS "Processing pooling comprehensive coverage config: ${COMPREHENSIVE_CONFIG}") + foreach(datatype IN LISTS TEST_DATATYPES) + build_pool_test_targets("${datatype}" "${COMPREHENSIVE_CONFIG}") + endforeach() + else() + message(WARNING "Pooling comprehensive config file not found: ${COMPREHENSIVE_CONFIG_FILE}") + endif() +elseif(NOT POOLING_COVERAGE_LEVEL STREQUAL "quick") + message(FATAL_ERROR "Invalid POOLING_COVERAGE_LEVEL: ${POOLING_COVERAGE_LEVEL}. Must be 'quick' or 'comprehensive'") +endif() + +message(STATUS "Pooling tile engine tests configured:") +message(STATUS " - Simple test: fp16/fp32 (always)") +message(STATUS " - Coverage level: ${POOLING_COVERAGE_LEVEL}") +message(STATUS " Use -DPOOLING_COVERAGE_LEVEL=comprehensive for extensive testing") diff --git a/test/ck_tile/pooling_tile_engine/README.md b/test/ck_tile/pooling_tile_engine/README.md new file mode 100644 index 0000000000..8364affbde --- /dev/null +++ b/test/ck_tile/pooling_tile_engine/README.md @@ -0,0 +1,87 @@ +# Pooling Tile Engine Tests + +Unit tests for pooling kernels generated by the tile_engine pooling codegen system. + +## Overview + +These tests validate pooling kernels that are generated at CMake configuration time +by `pooling_instance_builder.py`. 
Each kernel configuration (tile shape + traits) +gets its own GTest executable that verifies correctness against a CPU reference +implementation. + +## Architecture + + +``` +test/ck_tile/pooling_tile_engine/ +├── CMakeLists.txt # Build infrastructure +├── configs/ +│ ├── simple_test_config.json # Test configuration with problem sizes +│ └── comprehensive_coverage_config.json # Extensive coverage configuration +├── extract_test_params.py # Extracts problem sizes to C++ header +├── test_pooling_simple.cpp # GTest driver (parameterized) +└── README.md # This file +``` + +### Build Flow + +1. **CMake configuration**: `CMakeLists.txt` invokes `pooling_instance_builder.py --list_kernels` + to discover valid kernel configurations from the JSON config. +2. **Parameter extraction**: `extract_test_params.py` generates `test_params_2d.hpp` and `test_params_3d.hpp` with + problem sizes from the JSON config. +3. **Header generation**: For each kernel, `pooling_instance_builder.py --gen_single` + generates a C++ header defining `SelectedKernel` with the specific tile configuration. +4. **Compilation**: Each kernel gets a separate test executable compiled with the + generated header via `-include`. +5. **Execution**: GTest runs each problem size as a separate test case, comparing + device results against the CPU reference. + +## Configuration + +### `simple_test_config.json` + +Defines: +- **tile_config**: Block/warp/thread tile dimensions for PoolShape +- **trait_config**: Reduce op (max/avg), output_index, propagate_nan, pooling_dim (2d/3d) +- **test_params**: Problem sizes (N, H, W, C, window, stride, dilation, padding) + +### Supported configurations + +- **Data types**: fp16, fp32 +- **Reduce operations**: max (with index output) +- **Pooling dimensions**: 2D (NHWC), 3D (NDHWC) +- **GPU targets**: gfx90a, gfx942, gfx950, gfx1201 + +## Building + +```bash +# From the build directory: +cmake --build . --target test_pooling_tile_engine_fp16_simple_test_config_max_true_false_2d_128x1_1x1_128x1_2x1 + +# Or build all pooling tests: +cmake --build . 
--target tests +``` + +## Running + +```bash +# Run a specific test: +./test_pooling_tile_engine_fp16_simple_test_config_max_true_false_2d_128x1_1x1_128x1_2x1 + +# Run with GTest filters: +./test_pooling_tile_engine_fp16_simple_test_config_max_true_false_2d_128x1_1x1_128x1_2x1 --gtest_filter="*BasicFunctionality*" +``` + +## Relationship to tile_engine + +The tile_engine pooling op lives at `tile_engine/ops/pooling/` and provides: +- `pooling_instance_builder.py` - Codegen for kernel headers +- `pooling_validation_utils.py` - Configuration validation +- `pooling_common.hpp` - Shared trait definitions +- `pooling_benchmark.hpp` - Problem/metric definitions +- `pooling_benchmark_single.cpp` - Single-kernel benchmark entry point + +The underlying ck_tile pooling kernel lives at `include/ck_tile/ops/pooling/` and provides: +- `PoolKernel` - GPU kernel implementation +- `PoolProblem` - Problem parameterization +- `PoolShape` - Tile shape specification +- `PoolDefaultPolicy` - Tile distribution and reduction policies diff --git a/test/ck_tile/pooling_tile_engine/configs/comprehensive_coverage_config.json b/test/ck_tile/pooling_tile_engine/configs/comprehensive_coverage_config.json new file mode 100644 index 0000000000..0c9a6dfc7a --- /dev/null +++ b/test/ck_tile/pooling_tile_engine/configs/comprehensive_coverage_config.json @@ -0,0 +1,165 @@ +{ + "problem": { + "description": "Comprehensive pooling coverage testing - multiple block sizes (64-512), warp configurations, thread tile sizes, and all trait combinations (max/avg, index, NaN propagation). Approximately 200+ kernels." 
+ }, + "test_params": { + "problem_sizes_2d": [ + { + "_comment": "Basic: small tensor, 2x2 window, stride 2, no padding", + "N": 1, "H": 8, "W": 8, "C": 32, + "Y": 2, "X": 2, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + }, + { + "_comment": "Padded 3x3: moderate tensor with symmetric padding, stride 1 (overlapping)", + "N": 1, "H": 16, "W": 16, "C": 64, + "Y": 3, "X": 3, + "stride_h": 1, "stride_w": 1, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 1, "pad_h_right": 1, + "pad_w_left": 1, "pad_w_right": 1 + }, + { + "_comment": "Large channels: stress-test the C dimension", + "N": 1, "H": 16, "W": 16, "C": 256, + "Y": 2, "X": 2, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + }, + { + "_comment": "Large batch: multi-batch correctness", + "N": 4, "H": 16, "W": 16, "C": 32, + "Y": 2, "X": 2, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + }, + { + "_comment": "Non-square spatial: rectangular H != W", + "N": 2, "H": 32, "W": 16, "C": 64, + "Y": 3, "X": 3, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 1, "pad_h_right": 1, + "pad_w_left": 1, "pad_w_right": 1 + }, + { + "_comment": "Large window 5x5: bigger receptive field", + "N": 1, "H": 32, "W": 32, "C": 32, + "Y": 5, "X": 5, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 2, "pad_h_right": 2, + "pad_w_left": 2, "pad_w_right": 2 + }, + { + "_comment": "Large window 7x7: global-style pooling", + "N": 1, "H": 14, "W": 14, "C": 128, + "Y": 7, "X": 7, + "stride_h": 1, "stride_w": 1, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 3, "pad_h_right": 3, + "pad_w_left": 3, "pad_w_right": 3 + }, + { + "_comment": "Dilated: dilation_h=2, dilation_w=2 with 3x3 
window", + "N": 1, "H": 32, "W": 32, "C": 64, + "Y": 3, "X": 3, + "stride_h": 1, "stride_w": 1, + "dilation_h": 2, "dilation_w": 2, + "pad_h_left": 2, "pad_h_right": 2, + "pad_w_left": 2, "pad_w_right": 2 + }, + { + "_comment": "Asymmetric padding: different left/right padding", + "N": 2, "H": 16, "W": 16, "C": 32, + "Y": 3, "X": 3, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 0, "pad_h_right": 1, + "pad_w_left": 0, "pad_w_right": 1 + }, + { + "_comment": "Large spatial: bigger feature maps", + "N": 1, "H": 64, "W": 64, "C": 64, + "Y": 2, "X": 2, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + }, + { + "_comment": "Non-square window: Y != X", + "N": 1, "H": 32, "W": 32, "C": 32, + "Y": 3, "X": 2, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 1, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + }, + { + "_comment": "Stride-1 overlap: overlapping 2x2 windows", + "N": 2, "H": 16, "W": 16, "C": 64, + "Y": 2, "X": 2, + "stride_h": 1, "stride_w": 1, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + } + ], + "problem_sizes_3d": [ + { + "_comment": "Basic 3D: small volume, 2x2x2 window", + "N": 1, "D": 4, "H": 4, "W": 4, "C": 32, + "Z": 2, "Y": 2, "X": 2, + "stride_d": 2, "stride_h": 2, "stride_w": 2, + "dilation_d": 1, "dilation_h": 1, "dilation_w": 1, + "pad_d_left": 0, "pad_d_right": 0, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + }, + { + "_comment": "Padded 3D: with symmetric padding", + "N": 1, "D": 8, "H": 8, "W": 8, "C": 32, + "Z": 3, "Y": 3, "X": 3, + "stride_d": 2, "stride_h": 2, "stride_w": 2, + "dilation_d": 1, "dilation_h": 1, "dilation_w": 1, + "pad_d_left": 1, "pad_d_right": 1, + "pad_h_left": 1, "pad_h_right": 1, + "pad_w_left": 1, "pad_w_right": 1 + }, + { + "_comment": "Multi-batch 3D: 
larger batch and channels", + "N": 2, "D": 8, "H": 8, "W": 8, "C": 64, + "Z": 2, "Y": 2, "X": 2, + "stride_d": 2, "stride_h": 2, "stride_w": 2, + "dilation_d": 1, "dilation_h": 1, "dilation_w": 1, + "pad_d_left": 0, "pad_d_right": 0, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + } + ] + }, + "tile_config": { + "block_m": {"values": [64, 128, 256, 512]}, + "block_n": {"values": [1]}, + "warp_m": {"values": [1, 2, 4]}, + "warp_n": {"values": [1]}, + "warp_tile_m": {"values": [64, 128, 256]}, + "warp_tile_n": {"values": [1]}, + "thread_tile_m": {"values": [1, 2, 4]}, + "thread_tile_n": {"values": [1]} + }, + "trait_config": { + "reduce_op": {"values": ["max", "avg"]}, + "output_index": {"values": [true, false]}, + "propagate_nan": {"values": [true, false]}, + "pooling_dim": {"values": ["2d", "3d"]} + } +} diff --git a/test/ck_tile/pooling_tile_engine/configs/simple_test_config.json b/test/ck_tile/pooling_tile_engine/configs/simple_test_config.json new file mode 100644 index 0000000000..2ea9c376ce --- /dev/null +++ b/test/ck_tile/pooling_tile_engine/configs/simple_test_config.json @@ -0,0 +1,60 @@ +{ + "problem": { + "description": "Basic pooling functionality validation with moderate problem sizes" + }, + "test_params": { + "problem_sizes_2d": [ + { + "N": 1, "H": 8, "W": 8, "C": 32, + "Y": 2, "X": 2, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + }, + { + "N": 2, "H": 16, "W": 16, "C": 32, + "Y": 3, "X": 3, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 1, "pad_h_right": 1, + "pad_w_left": 1, "pad_w_right": 1 + }, + { + "N": 1, "H": 32, "W": 32, "C": 64, + "Y": 2, "X": 2, + "stride_h": 2, "stride_w": 2, + "dilation_h": 1, "dilation_w": 1, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + } + ], + "problem_sizes_3d": [ + { + "N": 1, "D": 4, "H": 4, "W": 4, "C": 32, + "Z": 2, "Y": 2, 
"X": 2, + "stride_d": 2, "stride_h": 2, "stride_w": 2, + "dilation_d": 1, "dilation_h": 1, "dilation_w": 1, + "pad_d_left": 0, "pad_d_right": 0, + "pad_h_left": 0, "pad_h_right": 0, + "pad_w_left": 0, "pad_w_right": 0 + } + ] + }, + "tile_config": { + "block_m": {"values": [128]}, + "block_n": {"values": [1]}, + "warp_m": {"values": [1]}, + "warp_n": {"values": [1]}, + "warp_tile_m": {"values": [128]}, + "warp_tile_n": {"values": [1]}, + "thread_tile_m": {"values": [2]}, + "thread_tile_n": {"values": [1]} + }, + "trait_config": { + "reduce_op": {"values": ["max"]}, + "output_index": {"values": [true]}, + "propagate_nan": {"values": [false]}, + "pooling_dim": {"values": ["2d"]} + } +} \ No newline at end of file diff --git a/test/ck_tile/pooling_tile_engine/extract_test_params.py b/test/ck_tile/pooling_tile_engine/extract_test_params.py new file mode 100644 index 0000000000..86c809dd36 --- /dev/null +++ b/test/ck_tile/pooling_tile_engine/extract_test_params.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Extract pooling test parameters from config JSON and write to C++ header. +Generates test_params.hpp with problem sizes for parameterized GTest. 
+""" + +import json +import argparse +import os +from pathlib import Path + + +def extract_test_params(config_file, output_file, pooling_dim="2d"): + """Extract test parameters from config JSON and write to output file""" + + with open(config_file, "r") as f: + config = json.load(f) + + # Extract test parameters based on pooling dimension + test_params = [] + if pooling_dim == "2d": + if "test_params" in config and "problem_sizes_2d" in config["test_params"]: + test_params = config["test_params"]["problem_sizes_2d"] + else: + # Default 2D test parameters + test_params = [ + { + "N": 1, + "H": 8, + "W": 8, + "C": 32, + "Y": 2, + "X": 2, + "stride_h": 2, + "stride_w": 2, + "dilation_h": 1, + "dilation_w": 1, + "pad_h_left": 0, + "pad_h_right": 0, + "pad_w_left": 0, + "pad_w_right": 0, + }, + { + "N": 2, + "H": 16, + "W": 16, + "C": 32, + "Y": 3, + "X": 3, + "stride_h": 2, + "stride_w": 2, + "dilation_h": 1, + "dilation_w": 1, + "pad_h_left": 1, + "pad_h_right": 1, + "pad_w_left": 1, + "pad_w_right": 1, + }, + ] + else: # 3d + if "test_params" in config and "problem_sizes_3d" in config["test_params"]: + test_params = config["test_params"]["problem_sizes_3d"] + else: + # Default 3D test parameters + test_params = [ + { + "N": 1, + "D": 4, + "H": 4, + "W": 4, + "C": 32, + "Z": 2, + "Y": 2, + "X": 2, + "stride_d": 2, + "stride_h": 2, + "stride_w": 2, + "dilation_d": 1, + "dilation_h": 1, + "dilation_w": 1, + "pad_d_left": 0, + "pad_d_right": 0, + "pad_h_left": 0, + "pad_h_right": 0, + "pad_w_left": 0, + "pad_w_right": 0, + }, + ] + + # Write to output file in C++ format + output_dir = Path(output_file).parent + output_dir.mkdir(parents=True, exist_ok=True) + + with open(output_file, "w") as f: + f.write("// Generated test parameters for pooling tile_engine tests\n") + f.write("// This file is auto-generated during CMake configuration\n\n") + + if pooling_dim == "2d": + f.write( + "static const std::vector<PoolTestParams2D> CONFIG_TEST_PARAMS = {\n" + ) + for i, params in 
enumerate(test_params): + comma = "," if i < len(test_params) - 1 else "" + f.write( + f" {{" + f"{params['N']}, {params['H']}, {params['W']}, {params['C']}, " + f"{params['Y']}, {params['X']}, " + f"{params['stride_h']}, {params['stride_w']}, " + f"{params['dilation_h']}, {params['dilation_w']}, " + f"{params['pad_h_left']}, {params['pad_h_right']}, " + f"{params['pad_w_left']}, {params['pad_w_right']}" + f"}}{comma}\n" + ) + f.write("};\n") + else: # 3d + f.write( + "static const std::vector<PoolTestParams3D> CONFIG_TEST_PARAMS = {\n" + ) + for i, params in enumerate(test_params): + comma = "," if i < len(test_params) - 1 else "" + f.write( + f" {{" + f"{params['N']}, {params['D']}, {params['H']}, {params['W']}, {params['C']}, " + f"{params['Z']}, {params['Y']}, {params['X']}, " + f"{params['stride_d']}, {params['stride_h']}, {params['stride_w']}, " + f"{params['dilation_d']}, {params['dilation_h']}, {params['dilation_w']}, " + f"{params['pad_d_left']}, {params['pad_d_right']}, " + f"{params['pad_h_left']}, {params['pad_h_right']}, " + f"{params['pad_w_left']}, {params['pad_w_right']}" + f"}}{comma}\n" + ) + f.write("};\n") + + print( + f"Extracted {len(test_params)} {pooling_dim} test parameters from {config_file} -> {output_file}" + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Extract pooling test parameters from config JSON" + ) + parser.add_argument("--config_file", required=True, help="Input config JSON file") + parser.add_argument( + "--output_file", required=True, help="Output test parameters file" + ) + parser.add_argument( + "--pooling_dim", + default="2d", + choices=["2d", "3d"], + help="Pooling dimension (2d or 3d)", + ) + + args = parser.parse_args() + + if not os.path.exists(args.config_file): + print(f"Error: Config file not found: {args.config_file}") + return 1 + + extract_test_params(args.config_file, args.output_file, args.pooling_dim) + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git 
a/test/ck_tile/pooling_tile_engine/test_pooling_simple.cpp b/test/ck_tile/pooling_tile_engine/test_pooling_simple.cpp new file mode 100644 index 0000000000..dd9cb2a84a --- /dev/null +++ b/test/ck_tile/pooling_tile_engine/test_pooling_simple.cpp @@ -0,0 +1,435 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/** + * @file test_pooling_simple.cpp + * @brief Unit tests for pooling kernels generated by pooling_instance_builder + * + * This test includes kernels generated during CMake configuration by + * pooling_instance_builder.py and tests them with problem sizes extracted + * from the corresponding JSON configuration files. + */ + +#include <gtest/gtest.h> +#include <iostream> +#include <string> +#include <string_view> +#include <vector> + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/pooling.hpp" +#include "ck_tile/host/reference/reference_pool.hpp" +#include "tile_engine/ops/pooling/pooling_common.hpp" + +// The kernel header is included via compile command line with -include flag +// It defines: SelectedKernel, KERNEL_NAME, InDataType, OutDataType, +// ComputeDataType, IndexDataType, ReduceOpType, +// TensorShape, WindowShape, POOLING_DIM + +// ============================================================================ +// Test parameter structures +// ============================================================================ + +/// @brief Test parameters for 2D pooling +struct PoolTestParams2D +{ + int N, H, W, C; // Input dimensions (NHWC) + int Y, X; // Window size + int stride_h, stride_w; // Strides + int dilation_h, dilation_w; // Dilations + int pad_h_left, pad_h_right; // Height padding + int pad_w_left, pad_w_right; // Width padding +}; + +/// @brief Test parameters for 3D pooling +struct PoolTestParams3D +{ + int N, D, H, W, C; // Input dimensions (NDHWC) + int Z, Y, X; // Window size + int stride_d, stride_h, stride_w; // Strides + int dilation_d, dilation_h, dilation_w; // Dilations + int pad_d_left, pad_d_right; // 
Depth padding + int pad_h_left, pad_h_right; // Height padding + int pad_w_left, pad_w_right; // Width padding +}; + +// Include config-specific test parameters (after parameter structs are defined) +#ifdef POOLING_TEST_PARAMS_HPP +#include POOLING_TEST_PARAMS_HPP +#endif + +// POOLING_DIM_VALUE is set by CMake as a compile definition: +// 2 for 2D pooling kernels, 3 for 3D pooling kernels. +// This selects the appropriate test class and parameterization at compile time. + +#if POOLING_DIM_VALUE == 2 +// ============================================================================ +// 2D Pooling Tests +// ============================================================================ + +class PoolingTileEngineTest2D : public ::testing::TestWithParam +{ + protected: + void SetUp() override + { + auto params = GetParam(); + N_ = params.N; + H_ = params.H; + W_ = params.W; + C_ = params.C; + Y_ = params.Y; + X_ = params.X; + stride_h_ = params.stride_h; + stride_w_ = params.stride_w; + dilation_h_ = params.dilation_h; + dilation_w_ = params.dilation_w; + pad_h_left_ = params.pad_h_left; + pad_h_right_ = params.pad_h_right; + pad_w_left_ = params.pad_w_left; + pad_w_right_ = params.pad_w_right; + + // Calculate output dimensions + ck_tile::index_t Ys = (Y_ - 1) * dilation_h_ + 1; + ck_tile::index_t Xs = (X_ - 1) * dilation_w_ + 1; + Ho_ = (H_ + pad_h_left_ + pad_h_right_ - Ys) / stride_h_ + 1; + Wo_ = (W_ + pad_w_left_ + pad_w_right_ - Xs) / stride_w_ + 1; + } + + int N_, H_, W_, C_; + int Y_, X_; + int stride_h_, stride_w_; + int dilation_h_, dilation_w_; + int pad_h_left_, pad_h_right_; + int pad_w_left_, pad_w_right_; + int Ho_, Wo_; +}; + +TEST_P(PoolingTileEngineTest2D, BasicFunctionality) +{ + // Create host tensors + ck_tile::HostTensor h_in({N_, H_, W_, C_}); + ck_tile::HostTensor h_out({N_, Ho_, Wo_, C_}); + ck_tile::HostTensor h_out_ref({N_, Ho_, Wo_, C_}); + ck_tile::HostTensor h_out_index({N_, Ho_, Wo_, C_}); + ck_tile::HostTensor h_out_ref_index({N_, Ho_, Wo_, 
C_}); + + // Initialize input with random data + ck_tile::FillUniformDistribution{-5.f, 5.f}(h_in); + h_out.SetZero(); + h_out_ref.SetZero(); + + // Device memory + ck_tile::DeviceMem d_in(h_in.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_out(h_out.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_out_index(h_out_index.get_element_space_size_in_bytes()); + + d_in.ToDevice(h_in.data()); + d_out.SetZero(); + d_out_index.SetZero(); + + // Build shapes and strides (NHWC layout) + auto input_shape = ck_tile::make_tuple(N_, H_, W_, C_); + auto output_shape = ck_tile::make_tuple(N_, Ho_, Wo_, C_); + auto input_strides = ck_tile::make_tuple(H_ * W_ * C_, W_ * C_, C_, 1); + auto output_strides = ck_tile::make_tuple(Ho_ * Wo_ * C_, Wo_ * C_, C_, 1); + auto window_lengths = ck_tile::make_tuple(Y_, X_); + auto window_strides = ck_tile::make_tuple(stride_h_, stride_w_); + auto window_dilations = ck_tile::make_tuple(dilation_h_, dilation_w_); + auto input_left_pads = ck_tile::make_tuple(pad_h_left_, pad_w_left_); + auto input_right_pads = ck_tile::make_tuple(pad_h_right_, pad_w_right_); + + // Build host args for the generated kernel + auto host_args = ck_tile::PoolHostArgs{ + d_in.GetDeviceBuffer(), + d_out.GetDeviceBuffer(), + d_out_index.GetDeviceBuffer(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + // Stream config: no timing overhead for fastest execution + ck_tile::stream_config stream_config{nullptr, false, 0, 0, 1, false, false, 1}; + + // Launch generated kernel + try + { + SelectedKernel::launch(host_args, stream_config); + } + catch(const std::exception& e) + { + std::string error_msg(e.what()); + if(error_msg.find("Arguments not supported") != std::string::npos) + { + GTEST_SKIP() << "Configuration not supported: " << e.what(); + } + else + { + FAIL() << "Kernel launch failed: " << e.what(); + } + } + + // Copy results back 
+ d_out.FromDevice(h_out.data()); + d_out_index.FromDevice(h_out_index.data()); + + // Compute reference on host + auto kernel_args_ref = ck_tile::PoolKernelArgs{ + h_in.data(), + h_out_ref.data(), + h_out_ref_index.data(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + ck_tile::reference_pool2d( + h_in, h_out_ref, h_out_ref_index, kernel_args_ref, ReduceOpType{}); + + // Verify value results + bool pass_value = ck_tile::check_err(h_out, h_out_ref, "Error: Incorrect values!", 1e-5, 1e-5); + EXPECT_TRUE(pass_value) << "Pooling value verification failed for " << KERNEL_NAME; + + // Verify index results if output_index is enabled + if constexpr(SelectedKernel::kOutputIndex) + { + bool pass_index = + ck_tile::check_err(h_out_index, h_out_ref_index, "Error: Incorrect indices!", 0, 0); + EXPECT_TRUE(pass_index) << "Pooling index verification failed for " << KERNEL_NAME; + } +} + +TEST_P(PoolingTileEngineTest2D, KernelInfo) +{ + EXPECT_TRUE(std::string_view(KERNEL_NAME).size() > 0) << "Kernel name should not be empty"; + + std::cout << "Testing kernel: " << KERNEL_NAME << std::endl; + std::cout << "Problem size: N=" << N_ << " H=" << H_ << " W=" << W_ << " C=" << C_ + << " Window=" << Y_ << "x" << X_ << " Output=" << Ho_ << "x" << Wo_ << std::endl; +} + +// Instantiate test suite with config-specific test parameters +// CONFIG_TEST_PARAMS is defined in the auto-generated test_params_2d.hpp file +INSTANTIATE_TEST_SUITE_P(PoolingVerification, + PoolingTileEngineTest2D, + ::testing::ValuesIn(CONFIG_TEST_PARAMS), + [](const ::testing::TestParamInfo& param_info) { + return "N" + std::to_string(param_info.param.N) + "_H" + + std::to_string(param_info.param.H) + "_W" + + std::to_string(param_info.param.W) + "_C" + + std::to_string(param_info.param.C) + "_Y" + + std::to_string(param_info.param.Y) + "_X" + + std::to_string(param_info.param.X); + }); + +#elif 
POOLING_DIM_VALUE == 3 +// ============================================================================ +// 3D Pooling Tests +// ============================================================================ + +class PoolingTileEngineTest3D : public ::testing::TestWithParam +{ + protected: + void SetUp() override + { + auto params = GetParam(); + N_ = params.N; + D_ = params.D; + H_ = params.H; + W_ = params.W; + C_ = params.C; + Z_ = params.Z; + Y_ = params.Y; + X_ = params.X; + stride_d_ = params.stride_d; + stride_h_ = params.stride_h; + stride_w_ = params.stride_w; + dilation_d_ = params.dilation_d; + dilation_h_ = params.dilation_h; + dilation_w_ = params.dilation_w; + pad_d_left_ = params.pad_d_left; + pad_d_right_ = params.pad_d_right; + pad_h_left_ = params.pad_h_left; + pad_h_right_ = params.pad_h_right; + pad_w_left_ = params.pad_w_left; + pad_w_right_ = params.pad_w_right; + + // Calculate output dimensions + ck_tile::index_t Zs = (Z_ - 1) * dilation_d_ + 1; + ck_tile::index_t Ys = (Y_ - 1) * dilation_h_ + 1; + ck_tile::index_t Xs = (X_ - 1) * dilation_w_ + 1; + Do_ = (D_ + pad_d_left_ + pad_d_right_ - Zs) / stride_d_ + 1; + Ho_ = (H_ + pad_h_left_ + pad_h_right_ - Ys) / stride_h_ + 1; + Wo_ = (W_ + pad_w_left_ + pad_w_right_ - Xs) / stride_w_ + 1; + } + + int N_, D_, H_, W_, C_; + int Z_, Y_, X_; + int stride_d_, stride_h_, stride_w_; + int dilation_d_, dilation_h_, dilation_w_; + int pad_d_left_, pad_d_right_; + int pad_h_left_, pad_h_right_; + int pad_w_left_, pad_w_right_; + int Do_, Ho_, Wo_; +}; + +TEST_P(PoolingTileEngineTest3D, BasicFunctionality) +{ + // Create host tensors (NDHWC layout) + ck_tile::HostTensor h_in({N_, D_, H_, W_, C_}); + ck_tile::HostTensor h_out({N_, Do_, Ho_, Wo_, C_}); + ck_tile::HostTensor h_out_ref({N_, Do_, Ho_, Wo_, C_}); + ck_tile::HostTensor h_out_index({N_, Do_, Ho_, Wo_, C_}); + ck_tile::HostTensor h_out_ref_index({N_, Do_, Ho_, Wo_, C_}); + + // Initialize input with random data + 
ck_tile::FillUniformDistribution{-5.f, 5.f}(h_in); + h_out.SetZero(); + h_out_ref.SetZero(); + + // Device memory + ck_tile::DeviceMem d_in(h_in.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_out(h_out.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_out_index(h_out_index.get_element_space_size_in_bytes()); + + d_in.ToDevice(h_in.data()); + d_out.SetZero(); + d_out_index.SetZero(); + + // Build shapes and strides (NDHWC layout) + auto input_shape = ck_tile::make_tuple(N_, D_, H_, W_, C_); + auto output_shape = ck_tile::make_tuple(N_, Do_, Ho_, Wo_, C_); + auto input_strides = ck_tile::make_tuple(D_ * H_ * W_ * C_, H_ * W_ * C_, W_ * C_, C_, 1); + auto output_strides = + ck_tile::make_tuple(Do_ * Ho_ * Wo_ * C_, Ho_ * Wo_ * C_, Wo_ * C_, C_, 1); + auto window_lengths = ck_tile::make_tuple(Z_, Y_, X_); + auto window_strides = ck_tile::make_tuple(stride_d_, stride_h_, stride_w_); + auto window_dilations = ck_tile::make_tuple(dilation_d_, dilation_h_, dilation_w_); + auto input_left_pads = ck_tile::make_tuple(pad_d_left_, pad_h_left_, pad_w_left_); + auto input_right_pads = ck_tile::make_tuple(pad_d_right_, pad_h_right_, pad_w_right_); + + // Build host args for the generated kernel + auto host_args = ck_tile::PoolHostArgs{ + d_in.GetDeviceBuffer(), + d_out.GetDeviceBuffer(), + d_out_index.GetDeviceBuffer(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + // Stream config: no timing overhead for fastest execution + ck_tile::stream_config stream_config{nullptr, false, 0, 0, 1, false, false, 1}; + + // Launch generated kernel + try + { + SelectedKernel::launch(host_args, stream_config); + } + catch(const std::exception& e) + { + std::string error_msg(e.what()); + if(error_msg.find("Arguments not supported") != std::string::npos) + { + GTEST_SKIP() << "Configuration not supported: " << e.what(); + } + else + { + FAIL() << "Kernel 
launch failed: " << e.what(); + } + } + + // Copy results back + d_out.FromDevice(h_out.data()); + d_out_index.FromDevice(h_out_index.data()); + + // Compute reference on host + auto kernel_args_ref = ck_tile::PoolKernelArgs{ + h_in.data(), + h_out_ref.data(), + h_out_ref_index.data(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + ck_tile::reference_pool3d( + h_in, h_out_ref, h_out_ref_index, kernel_args_ref, ReduceOpType{}); + + // Verify value results + bool pass_value = ck_tile::check_err(h_out, h_out_ref, "Error: Incorrect values!", 1e-5, 1e-5); + EXPECT_TRUE(pass_value) << "Pooling 3D value verification failed for " << KERNEL_NAME; + + // Verify index results if output_index is enabled + if constexpr(SelectedKernel::kOutputIndex) + { + bool pass_index = + ck_tile::check_err(h_out_index, h_out_ref_index, "Error: Incorrect indices!", 0, 0); + EXPECT_TRUE(pass_index) << "Pooling 3D index verification failed for " << KERNEL_NAME; + } +} + +TEST_P(PoolingTileEngineTest3D, KernelInfo) +{ + EXPECT_TRUE(std::string_view(KERNEL_NAME).size() > 0) << "Kernel name should not be empty"; + + std::cout << "Testing kernel: " << KERNEL_NAME << std::endl; + std::cout << "Problem size: N=" << N_ << " D=" << D_ << " H=" << H_ << " W=" << W_ + << " C=" << C_ << " Window=" << Z_ << "x" << Y_ << "x" << X_ << " Output=" << Do_ + << "x" << Ho_ << "x" << Wo_ << std::endl; +} + +// Instantiate test suite with config-specific test parameters +// CONFIG_TEST_PARAMS is defined in the auto-generated test_params_3d.hpp file +INSTANTIATE_TEST_SUITE_P(PoolingVerification, + PoolingTileEngineTest3D, + ::testing::ValuesIn(CONFIG_TEST_PARAMS), + [](const ::testing::TestParamInfo& param_info) { + return "N" + std::to_string(param_info.param.N) + "_D" + + std::to_string(param_info.param.D) + "_H" + + std::to_string(param_info.param.H) + "_W" + + std::to_string(param_info.param.W) 
+ "_C" + + std::to_string(param_info.param.C) + "_Z" + + std::to_string(param_info.param.Z) + "_Y" + + std::to_string(param_info.param.Y) + "_X" + + std::to_string(param_info.param.X); + }); + +#else +#error "POOLING_DIM_VALUE must be 2 or 3" +#endif diff --git a/test/ck_tile/utility/CMakeLists.txt b/test/ck_tile/utility/CMakeLists.txt index 42bdb26e1d..2a377139b8 100644 --- a/test/ck_tile/utility/CMakeLists.txt +++ b/test/ck_tile/utility/CMakeLists.txt @@ -5,6 +5,7 @@ message("-- Adding: test/ck_tile/utility/") add_gtest_executable(test_fill test_fill.cpp) add_gtest_executable(test_ck_tile_sequence test_sequence.cpp) +add_gtest_executable(test_ck_tile_static_ford test_static_ford.cpp) # Add print tests add_subdirectory(print) diff --git a/test/ck_tile/utility/test_static_ford.cpp b/test/ck_tile/utility/test_static_ford.cpp new file mode 100644 index 0000000000..7337471647 --- /dev/null +++ b/test/ck_tile/utility/test_static_ford.cpp @@ -0,0 +1,293 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#include +#include +#include +#include "ck_tile/core/container/sequence.hpp" +#include "ck_tile/core/utility/functional.hpp" + +using namespace ck_tile; + +// ============================================================================ +// static_ford Tests — Identity Order (default) +// ============================================================================ + +TEST(CkTileStaticFord, Identity2D) +{ + std::vector> visited; + + static_ford>{}([&](auto multi_id) { + constexpr index_t i = multi_id[number<0>{}]; + constexpr index_t j = multi_id[number<1>{}]; + visited.emplace_back(i, j); + }); + + ASSERT_EQ(visited.size(), 6u); + EXPECT_EQ(visited[0], std::make_pair(0, 0)); + EXPECT_EQ(visited[1], std::make_pair(0, 1)); + EXPECT_EQ(visited[2], std::make_pair(0, 2)); + EXPECT_EQ(visited[3], std::make_pair(1, 0)); + EXPECT_EQ(visited[4], std::make_pair(1, 1)); + EXPECT_EQ(visited[5], std::make_pair(1, 2)); +} + +TEST(CkTileStaticFord, Identity3D) +{ + std::vector> visited; + + static_ford>{}([&](auto multi_id) { + constexpr index_t i = multi_id[number<0>{}]; + constexpr index_t j = multi_id[number<1>{}]; + constexpr index_t k = multi_id[number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 12u); + EXPECT_EQ(visited[0], std::make_tuple(0, 0, 0)); + EXPECT_EQ(visited[1], std::make_tuple(0, 0, 1)); + EXPECT_EQ(visited[2], std::make_tuple(0, 1, 0)); + EXPECT_EQ(visited[3], std::make_tuple(0, 1, 1)); + EXPECT_EQ(visited[4], std::make_tuple(0, 2, 0)); + EXPECT_EQ(visited[5], std::make_tuple(0, 2, 1)); + EXPECT_EQ(visited[6], std::make_tuple(1, 0, 0)); + EXPECT_EQ(visited[7], std::make_tuple(1, 0, 1)); + EXPECT_EQ(visited[8], std::make_tuple(1, 1, 0)); + EXPECT_EQ(visited[9], std::make_tuple(1, 1, 1)); + EXPECT_EQ(visited[10], std::make_tuple(1, 2, 0)); + EXPECT_EQ(visited[11], std::make_tuple(1, 2, 1)); +} + +TEST(CkTileStaticFord, Identity1D) +{ + std::vector visited; + + static_ford>{}([&](auto multi_id) { 
+ constexpr index_t i = multi_id[number<0>{}]; + visited.push_back(i); + }); + + ASSERT_EQ(visited.size(), 5u); + for(index_t i = 0; i < 5; ++i) + { + EXPECT_EQ(visited[i], i); + } +} + +TEST(CkTileStaticFord, SingleElement1D) +{ + std::vector visited; + + static_ford>{}([&](auto multi_id) { + constexpr index_t i = multi_id[number<0>{}]; + visited.push_back(i); + }); + + ASSERT_EQ(visited.size(), 1u); + EXPECT_EQ(visited[0], 0); +} + +TEST(CkTileStaticFord, SingleElement2D) +{ + std::vector> visited; + + static_ford>{}([&](auto multi_id) { + constexpr index_t i = multi_id[number<0>{}]; + constexpr index_t j = multi_id[number<1>{}]; + visited.emplace_back(i, j); + }); + + ASSERT_EQ(visited.size(), 1u); + EXPECT_EQ(visited[0], std::make_pair(0, 0)); +} + +TEST(CkTileStaticFord, IdentityWithUnitDim) +{ + std::vector> visited; + + static_ford>{}([&](auto multi_id) { + constexpr index_t i = multi_id[number<0>{}]; + constexpr index_t j = multi_id[number<1>{}]; + constexpr index_t k = multi_id[number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 6u); + EXPECT_EQ(visited[0], std::make_tuple(0, 0, 0)); + EXPECT_EQ(visited[1], std::make_tuple(0, 0, 1)); + EXPECT_EQ(visited[2], std::make_tuple(0, 0, 2)); + EXPECT_EQ(visited[3], std::make_tuple(1, 0, 0)); + EXPECT_EQ(visited[4], std::make_tuple(1, 0, 1)); + EXPECT_EQ(visited[5], std::make_tuple(1, 0, 2)); +} + +// ============================================================================ +// static_ford Tests — Non-Identity Order (primary template with decompose_reordered) +// ============================================================================ + +TEST(CkTileStaticFord, ReversedOrder2D) +{ + std::vector> visited; + + // Order (1, 0): dim 1 is outer, dim 0 is inner (column-major) + static_ford, sequence<1, 0>>{}([&](auto multi_id) { + constexpr index_t i = multi_id[number<0>{}]; + constexpr index_t j = multi_id[number<1>{}]; + visited.emplace_back(i, j); + }); + + 
ASSERT_EQ(visited.size(), 6u); + EXPECT_EQ(visited[0], std::make_pair(0, 0)); + EXPECT_EQ(visited[1], std::make_pair(1, 0)); + EXPECT_EQ(visited[2], std::make_pair(0, 1)); + EXPECT_EQ(visited[3], std::make_pair(1, 1)); + EXPECT_EQ(visited[4], std::make_pair(0, 2)); + EXPECT_EQ(visited[5], std::make_pair(1, 2)); +} + +TEST(CkTileStaticFord, CustomOrder3D_201) +{ + std::vector> visited; + + // Orders<2,0,1>: dim 2 outermost, dim 0 middle, dim 1 innermost + static_ford, sequence<2, 0, 1>>{}([&](auto multi_id) { + constexpr index_t i = multi_id[number<0>{}]; + constexpr index_t j = multi_id[number<1>{}]; + constexpr index_t k = multi_id[number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 24u); + // With orders (2,0,1): k varies slowest, then i, then j fastest + EXPECT_EQ(visited[0], std::make_tuple(0, 0, 0)); + EXPECT_EQ(visited[1], std::make_tuple(0, 1, 0)); + EXPECT_EQ(visited[2], std::make_tuple(0, 2, 0)); + EXPECT_EQ(visited[3], std::make_tuple(1, 0, 0)); + EXPECT_EQ(visited[4], std::make_tuple(1, 1, 0)); + EXPECT_EQ(visited[5], std::make_tuple(1, 2, 0)); + EXPECT_EQ(visited[6], std::make_tuple(0, 0, 1)); + EXPECT_EQ(visited[7], std::make_tuple(0, 1, 1)); + // Tail: last element should be (1, 2, 3) + EXPECT_EQ(visited[23], std::make_tuple(1, 2, 3)); +} + +TEST(CkTileStaticFord, CustomOrder3D_120) +{ + std::vector> visited; + + // Orders<1,2,0>: dim 1 outermost, dim 2 middle, dim 0 innermost + static_ford, sequence<1, 2, 0>>{}([&](auto multi_id) { + constexpr index_t i = multi_id[number<0>{}]; + constexpr index_t j = multi_id[number<1>{}]; + constexpr index_t k = multi_id[number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 12u); + // With orders (1,2,0): j varies slowest, then k, then i fastest + EXPECT_EQ(visited[0], std::make_tuple(0, 0, 0)); + EXPECT_EQ(visited[1], std::make_tuple(1, 0, 0)); + EXPECT_EQ(visited[2], std::make_tuple(0, 0, 1)); + EXPECT_EQ(visited[3], std::make_tuple(1, 0, 1)); + 
EXPECT_EQ(visited[4], std::make_tuple(0, 1, 0)); + EXPECT_EQ(visited[5], std::make_tuple(1, 1, 0)); + // Tail: last element should be (1, 2, 1) + EXPECT_EQ(visited[11], std::make_tuple(1, 2, 1)); +} + +TEST(CkTileStaticFord, NonIdentityWithUnitDim) +{ + std::vector> visited; + + // Unit dim at position 1 with non-trivial order + static_ford, sequence<2, 0, 1>>{}([&](auto multi_id) { + constexpr index_t i = multi_id[number<0>{}]; + constexpr index_t j = multi_id[number<1>{}]; + constexpr index_t k = multi_id[number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 6u); + // All entries must have j == 0 (unit dimension) + for(size_t idx = 0; idx < visited.size(); ++idx) + { + EXPECT_EQ(std::get<1>(visited[idx]), 0) << "Unit dim not zero at iteration " << idx; + } +} + +TEST(CkTileStaticFord, CustomOrder4D) +{ + std::vector> visited; + + // 4D with order <3,1,0,2> + static_ford, sequence<3, 1, 0, 2>>{}([&](auto multi_id) { + constexpr index_t a = multi_id[number<0>{}]; + constexpr index_t b = multi_id[number<1>{}]; + constexpr index_t c = multi_id[number<2>{}]; + constexpr index_t d = multi_id[number<3>{}]; + visited.emplace_back(a, b, c, d); + }); + + ASSERT_EQ(visited.size(), 48u); + // dim 3 (size 4) outermost, dim 1 (size 3) next, dim 0 (size 2) next, dim 2 (size 2) inner + EXPECT_EQ(visited[0], std::make_tuple(0, 0, 0, 0)); + EXPECT_EQ(visited[1], std::make_tuple(0, 0, 1, 0)); + EXPECT_EQ(visited[2], std::make_tuple(1, 0, 0, 0)); + EXPECT_EQ(visited[3], std::make_tuple(1, 0, 1, 0)); + EXPECT_EQ(visited[4], std::make_tuple(0, 1, 0, 0)); + EXPECT_EQ(visited[5], std::make_tuple(0, 1, 1, 0)); +} + +TEST(CkTileStaticFord, AsymmetricDimsWithOrder) +{ + std::vector> visited; + + // Asymmetric: 3x5 with reversed order + static_ford, sequence<1, 0>>{}([&](auto multi_id) { + constexpr index_t i = multi_id[number<0>{}]; + constexpr index_t j = multi_id[number<1>{}]; + visited.emplace_back(i, j); + }); + + ASSERT_EQ(visited.size(), 15u); + // dim 1 
(size 5) outer, dim 0 (size 3) inner + EXPECT_EQ(visited[0], std::make_pair(0, 0)); + EXPECT_EQ(visited[1], std::make_pair(1, 0)); + EXPECT_EQ(visited[2], std::make_pair(2, 0)); + EXPECT_EQ(visited[3], std::make_pair(0, 1)); + EXPECT_EQ(visited[4], std::make_pair(1, 1)); + EXPECT_EQ(visited[5], std::make_pair(2, 1)); +} + +// ============================================================================ +// Consistency: identity order matches explicit identity order +// ============================================================================ + +TEST(CkTileStaticFord, IdentityOrderMatchesExplicit) +{ + std::vector> default_visited; + std::vector> explicit_visited; + + static_ford>{}([&](auto multi_id) { + constexpr index_t i = multi_id[number<0>{}]; + constexpr index_t j = multi_id[number<1>{}]; + default_visited.emplace_back(i, j); + }); + + static_ford, sequence<0, 1>>{}([&](auto multi_id) { + constexpr index_t i = multi_id[number<0>{}]; + constexpr index_t j = multi_id[number<1>{}]; + explicit_visited.emplace_back(i, j); + }); + + ASSERT_EQ(default_visited.size(), explicit_visited.size()); + for(size_t i = 0; i < default_visited.size(); ++i) + { + EXPECT_EQ(default_visited[i], explicit_visited[i]) << "Mismatch at iteration " << i; + } +} + +// index_decomposer and inverse_perm are implementation details tested +// indirectly through the static_ford behavioral tests above. +// The IdentityOrderMatchesExplicit test verifies both code paths +// (identity specialization and primary template) produce identical results. 
diff --git a/tile_engine/CMakeLists.txt b/tile_engine/CMakeLists.txt index b9dc320128..36f479d8e6 100644 --- a/tile_engine/CMakeLists.txt +++ b/tile_engine/CMakeLists.txt @@ -5,7 +5,8 @@ include_directories(BEFORE ${CMAKE_CURRENT_LIST_DIR}/include ) -add_subdirectory(ops/gemm) -add_subdirectory(ops/gemm_streamk) -add_subdirectory(ops/reduce) +add_subdirectory(ops/gemm EXCLUDE_FROM_ALL) +add_subdirectory(ops/gemm_streamk EXCLUDE_FROM_ALL) +add_subdirectory(ops/pooling EXCLUDE_FROM_ALL) +add_subdirectory(ops/reduce EXCLUDE_FROM_ALL) diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt index ba5d34b9a2..94131f2cf1 100644 --- a/tile_engine/ops/gemm/CMakeLists.txt +++ b/tile_engine/ops/gemm/CMakeLists.txt @@ -1,7 +1,7 @@ # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT -add_subdirectory(gemm_universal) -add_subdirectory(gemm_multi_d) -add_subdirectory(gemm_preshuffle) -add_subdirectory(grouped_gemm) \ No newline at end of file +add_subdirectory(gemm_universal EXCLUDE_FROM_ALL) +add_subdirectory(gemm_multi_d EXCLUDE_FROM_ALL) +add_subdirectory(gemm_preshuffle EXCLUDE_FROM_ALL) +add_subdirectory(grouped_gemm EXCLUDE_FROM_ALL) \ No newline at end of file diff --git a/tile_engine/ops/gemm/gemm_multi_d/CMakeLists.txt b/tile_engine/ops/gemm/gemm_multi_d/CMakeLists.txt index b5f9a4b177..0e0a54f66a 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/CMakeLists.txt +++ b/tile_engine/ops/gemm/gemm_multi_d/CMakeLists.txt @@ -231,7 +231,7 @@ message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") # Filter GPU targets to only gfx90a, gfx942, gfx950 set(GEMM_MULTI_D_GPU_TARGETS_INDIVIDUAL "") -set(DESIRED_TARGETS "gfx90a;gfx942;gfx950") +set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx12-generic") foreach(target IN LISTS SUPPORTED_GPU_TARGETS) if(target IN_LIST DESIRED_TARGETS) @@ -242,7 +242,7 @@ endforeach() # Skip build if no matching targets found if(NOT GEMM_MULTI_D_GPU_TARGETS_INDIVIDUAL) - 
message(WARNING "Skipping Tile Engine GEMM Multi D build: No supported GPU targets (gfx90a, gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") + message(WARNING "Skipping Tile Engine GEMM Multi D build: No supported GPU targets (gfx90a, gfx942, gfx950, gfx12-generic) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") else() message(VERBOSE "Building individual GEMM Multi D targets for GPU targets: ${GEMM_MULTI_D_GPU_TARGETS_INDIVIDUAL}") diff --git a/tile_engine/ops/gemm/gemm_preshuffle/CMakeLists.txt b/tile_engine/ops/gemm/gemm_preshuffle/CMakeLists.txt index ad93007fe3..c6ca819a70 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/CMakeLists.txt +++ b/tile_engine/ops/gemm/gemm_preshuffle/CMakeLists.txt @@ -219,7 +219,7 @@ message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") # Filter GPU targets to only gfx90a, gfx942, and gfx950 set(GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL "") -set(DESIRED_TARGETS "gfx90a;gfx942;gfx950") +set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx12-generic") foreach(target IN LISTS SUPPORTED_GPU_TARGETS) if(target IN_LIST DESIRED_TARGETS) @@ -230,7 +230,7 @@ endforeach() # Skip build if no matching targets found if(NOT GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL) - message(WARNING "Skipping Tile Engine GEMM build: No supported GPU targets (gfx90a, gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") + message(WARNING "Skipping Tile Engine GEMM build: No supported GPU targets (gfx90a, gfx942, gfx950, gfx12-generic) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") else() message(VERBOSE "Building individual GEMM Preshuffle targets for GPU targets: ${GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL}") diff --git a/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt b/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt index 7505fcd6d0..df93f1a4ee 100644 --- a/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt +++ b/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt @@ -226,7 +226,7 @@ 
message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") # Filter GPU targets to only gfx90a, gfx942, gfx950, gfx1201 set(GEMM_UNIVERSAL_GPU_TARGETS_INDIVIDUAL "") -set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201") +set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201;gfx12-generic") foreach(target IN LISTS SUPPORTED_GPU_TARGETS) if(target IN_LIST DESIRED_TARGETS) @@ -237,7 +237,7 @@ endforeach() # Skip build if no matching targets found if(NOT GEMM_UNIVERSAL_GPU_TARGETS_INDIVIDUAL) - message(WARNING "Skipping Tile Engine GEMM Universal build: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") + message(WARNING "Skipping Tile Engine GEMM Universal build: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201, gfx12-generic) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") else() message(VERBOSE "Building individual GEMM Universal targets for GPU targets: ${GEMM_UNIVERSAL_GPU_TARGETS_INDIVIDUAL}") diff --git a/tile_engine/ops/gemm/gemm_validation_utils.py b/tile_engine/ops/gemm/gemm_validation_utils.py index 1af45f8e90..aa3c04cf95 100644 --- a/tile_engine/ops/gemm/gemm_validation_utils.py +++ b/tile_engine/ops/gemm/gemm_validation_utils.py @@ -25,6 +25,16 @@ ELEMENT_SIZE_MAP = { "fp64": 8, } +def get_warp_size_for_gpu(gpu_target: str) -> int: + """Get the warp size for a given GPU target. + + CDNA architectures (gfx9xx) use WAVE64 (64 threads per wavefront). + RDNA architectures (gfx10xx, gfx11xx, gfx12xx) use WAVE32 (32 threads per wavefront). 
+ """ + if gpu_target.startswith("gfx9"): + return 64 # CDNA - WAVE64 + return 32 # RDNA and others - WAVE32 + WARP_SUPPORTED_COMBINATIONS = { "gfx90a": [ [1, 4, 1], @@ -586,10 +596,11 @@ def validate_whole_wg_cover_configuration( layout, a_datatype, b_datatype, + gpu_target: str = "gfx90a", ) -> Tuple[bool, str]: # Validate whole workgroup cover configuration - warp_size = 64 + warp_size = get_warp_size_for_gpu(gpu_target) NumWarps = warp_m * warp_n * warp_k BlockSize = NumWarps * warp_size @@ -704,6 +715,73 @@ def wg_cover_core_validation( return True, "" +def validate_cshuffle_epilogue_distribution( + tile_m: int, + tile_n: int, + warp_m: int, + warp_n: int, + warp_k: int, + warp_tile_m: int, + warp_tile_n: int, + warp_size: int, + c_datatype: str, +) -> Tuple[bool, str]: + """ + Validate that the CShuffleEpilogue tile distribution pattern is valid. + + This mirrors the static_assert in static_encoding_pattern.hpp: + static_assert(X0 * Y1 == warp_size, "X0 * Y1 must cover whole wavefront!"); + + The CShuffleEpilogue creates a tile_distribution_encoding_pattern_2d + where: + - BlockSize = warp_m * warp_n * warp_k * warp_size + - YPerTile = MPerIterationShuffle (derived from tile_m / (warp_m * warp_tile_m / some_factor)) + - XPerTile = NPerIterationShuffle (derived from tile_n) + - VecSize = vector size based on element size (typically 8 for fp16) + + The key constraint is that X0 must evenly divide warp_size, where: + - X0 = min(warp_size, XPerTile / X1) + - X1 = min(VecSize, LargestVec) + - LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size) + """ + NumWarps = warp_m * warp_n * warp_k + BlockSize = NumWarps * warp_size + + elem_size = ELEMENT_SIZE_MAP.get(c_datatype, 2) + VecSize = 16 // elem_size + + XPerTile = tile_n + YPerTile = tile_m // warp_m + + if XPerTile <= 0 or YPerTile <= 0: + return False, f"Invalid tile dimensions: XPerTile={XPerTile}, YPerTile={YPerTile}" + + num_warps = BlockSize // warp_size + if num_warps * warp_size == 0: + return 
False, "Invalid BlockSize or warp_size" + + LargestVec = (XPerTile * YPerTile) // (num_warps * warp_size) + if LargestVec <= 0: + LargestVec = 1 + + X1 = min(VecSize, LargestVec) if LargestVec > 0 else VecSize + if X1 <= 0: + X1 = 1 + + X0 = min(warp_size, XPerTile // X1) if X1 > 0 else warp_size + + Y1 = warp_size // X0 if X0 > 0 else 0 + + if X0 * Y1 != warp_size: + return ( + False, + f"CShuffleEpilogue distribution invalid: X0({X0}) * Y1({Y1}) = {X0 * Y1} != warp_size({warp_size}). " + f"XPerTile={XPerTile}, YPerTile={YPerTile}, VecSize={VecSize}, BlockSize={BlockSize}" + ) + + return True, "" + + def get_global_vector_load_size( BlockSize: int, KPerBlock: int, @@ -766,6 +844,8 @@ def validate_gemm( trait_name: str = None, ) -> bool: # GEMM Validation + warp_size = get_warp_size_for_gpu(gpu_target) + # Validate whole workgroup cover configuration whole_workgroup_cover_valid, whole_workgroup_cover_error = ( validate_whole_wg_cover_configuration( @@ -778,6 +858,7 @@ def validate_gemm( layout, a_datatype, b_datatype, + gpu_target, ) ) if not whole_workgroup_cover_valid: @@ -786,6 +867,23 @@ def validate_gemm( ) return False, whole_workgroup_cover_error + # Validate CShuffleEpilogue distribution pattern (for cshuffle epilogue) + # This validation ensures the tile distribution pattern is valid for the output tile + cshuffle_valid, cshuffle_error = validate_cshuffle_epilogue_distribution( + tile_m, + tile_n, + warp_m, + warp_n, + warp_k, + warp_tile_m, + warp_tile_n, + warp_size, + c_datatype, + ) + if not cshuffle_valid: + logging.debug(f"CShuffleEpilogue validation failed: {cshuffle_error}") + return False, cshuffle_error + return True, "" @@ -808,6 +906,8 @@ def validate_gemm_preshuffle( trait_name: str = None, ) -> bool: # Preshuffle Validations + warp_size = get_warp_size_for_gpu(gpu_target) + # Validate vector load alignment m_iter_per_warp = tile_m / (warp_m * warp_tile_m) vector_valid, vector_error = validate_vector_load_alignment( @@ -815,7 +915,7 @@ def 
validate_gemm_preshuffle( warp_tile_k, a_datatype, m_iter_per_warp, - wave_size=64, + wave_size=warp_size, vector_load_size=16, ) if not vector_valid: @@ -831,7 +931,7 @@ def validate_gemm_preshuffle( warp_k, a_datatype, vector_load_size=16, - warp_size=64, + warp_size=warp_size, ) if not m0_m1_m2_valid: logging.debug(f"M0/M1/M2 configuration validation failed: {m0_m1_m2_error}") diff --git a/tile_engine/ops/gemm/grouped_gemm/CMakeLists.txt b/tile_engine/ops/gemm/grouped_gemm/CMakeLists.txt index a902c91d23..7cd27e04fb 100644 --- a/tile_engine/ops/gemm/grouped_gemm/CMakeLists.txt +++ b/tile_engine/ops/gemm/grouped_gemm/CMakeLists.txt @@ -226,7 +226,7 @@ message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") # Filter GPU targets to only gfx942, gfx950 set(GROUPED_GEMM_GPU_TARGETS_INDIVIDUAL "") -set(DESIRED_TARGETS "gfx942;gfx950") +set(DESIRED_TARGETS "gfx942;gfx950;gfx12-generic") foreach(target IN LISTS SUPPORTED_GPU_TARGETS) if(target IN_LIST DESIRED_TARGETS) @@ -237,7 +237,7 @@ endforeach() # Skip build if no matching targets found if(NOT GROUPED_GEMM_GPU_TARGETS_INDIVIDUAL) - message(WARNING "Skipping Tile Engine Grouped GEMM build: No supported GPU targets (gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") + message(WARNING "Skipping Tile Engine Grouped GEMM build: No supported GPU targets (gfx942, gfx950, gfx12-generic) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") else() message(VERBOSE "Building individual Grouped GEMM targets for GPU targets: ${GROUPED_GEMM_GPU_TARGETS_INDIVIDUAL}") diff --git a/tile_engine/ops/gemm_streamk/CMakeLists.txt b/tile_engine/ops/gemm_streamk/CMakeLists.txt index ae453ea11b..8ddf6ce39d 100644 --- a/tile_engine/ops/gemm_streamk/CMakeLists.txt +++ b/tile_engine/ops/gemm_streamk/CMakeLists.txt @@ -2,8 +2,8 @@ # SPDX-License-Identifier: MIT set(GEMM_STREAMK_DATATYPE "fp8;fp16" CACHE STRING "List of datatypes for GEMM (semicolon-separated)") -set(GEMM_STREAMK_LAYOUT "rcr" CACHE STRING 
"List of layout for GEMM (semicolon-separated)") -set(GEMM_CONFIG_FILE "" CACHE STRING "Custom config file name (without path, must be in configs/ folder)") +set(GEMM_STREAMK_LAYOUT "rcr;rrr;crr;ccr" CACHE STRING "List of layout for GEMM (semicolon-separated)") +set(GEMM_STREAMK_CONFIG_FILE "" CACHE STRING "Custom config file name (without path, must be in configs/ folder)") option(ENABLE_CCACHE_GEMM "Enable ccache for GEMM ops compilation" OFF) # Store the directory path for use in functions @@ -116,23 +116,23 @@ function(build_individual_gemm_targets datatype layout) # Choose config file # Priority order: - # 1. Environment variable GEMM_CONFIG_FILE - # 2. CMake variable GEMM_CONFIG_FILE + # 1. Environment variable GEMM_STREAMK_CONFIG_FILE + # 2. CMake variable GEMM_STREAMK_CONFIG_FILE # 3. Default based on layout # Check environment variable first - if(DEFINED ENV{GEMM_CONFIG_FILE} AND NOT "$ENV{GEMM_CONFIG_FILE}" STREQUAL "") - set(config_filename "$ENV{GEMM_CONFIG_FILE}") + if(DEFINED ENV{GEMM_STREAMK_CONFIG_FILE} AND NOT "$ENV{GEMM_STREAMK_CONFIG_FILE}" STREQUAL "") + set(config_filename "$ENV{GEMM_STREAMK_CONFIG_FILE}") set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${config_filename}") - message(STATUS " Using config from environment variable: ${config_filename}") - elseif(NOT "${GEMM_CONFIG_FILE}" STREQUAL "") + message(VERBOSE " Using config from environment variable: ${config_filename}") + elseif(NOT "${GEMM_STREAMK_CONFIG_FILE}" STREQUAL "") # Use CMake variable if set - set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${GEMM_CONFIG_FILE}") - message(STATUS " Using custom config: ${GEMM_CONFIG_FILE}") + set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${GEMM_STREAMK_CONFIG_FILE}") + message(VERBOSE " Using custom config: ${GEMM_STREAMK_CONFIG_FILE}") else() # Use default config for all layouts set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json") - message(STATUS " Using default config for layout ${layout}") + message(VERBOSE " Using 
default config for layout ${layout}") endif() # Check if config file exists @@ -153,17 +153,17 @@ function(build_individual_gemm_targets datatype layout) endif() # Generate individual kernel files using parallel version - message(STATUS "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...") - message(STATUS " Working path: ${working_path}") - message(STATUS " Config file: ${json_blob}") - message(STATUS " Python executable: ${Python3_EXECUTABLE}") - message(STATUS " Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_streamk_instance_builder.py") + message(VERBOSE "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...") + message(VERBOSE " Working path: ${working_path}") + message(VERBOSE " Config file: ${json_blob}") + message(VERBOSE " Python executable: ${Python3_EXECUTABLE}") + message(VERBOSE " Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_streamk_instance_builder.py") # Create working directory first file(MAKE_DIRECTORY ${working_path}) # First, just list the kernels (fast operation) - message(STATUS " Listing kernel configurations...") + message(VERBOSE " Listing kernel configurations...") execute_process( COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_streamk_instance_builder.py --working_path ${working_path} @@ -185,7 +185,7 @@ function(build_individual_gemm_targets datatype layout) if(EXISTS ${working_path}/gemm_kernel_count.txt) file(READ ${working_path}/gemm_kernel_count.txt kernel_count) string(STRIP "${kernel_count}" kernel_count) - message(STATUS " Found ${kernel_count} kernel configurations") + message(VERBOSE " Found ${kernel_count} kernel configurations") else() message(FATAL_ERROR "Kernel count file not found") endif() @@ -209,10 +209,10 @@ function(build_individual_gemm_targets datatype layout) endfunction() # Main build logic - Only individual builds supported -message(STATUS "=== Starting Tile Engine StreamK GEMM Configuration ===") -message(STATUS 
"GEMM_STREAMK_DATATYPE: ${GEMM_STREAMK_DATATYPE}") -message(STATUS "GEMM_STREAMK_LAYOUT: ${GEMM_STREAMK_LAYOUT}") -message(STATUS "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") +message(VERBOSE "=== Starting Tile Engine StreamK GEMM Configuration ===") +message(VERBOSE "GEMM_STREAMK_DATATYPE: ${GEMM_STREAMK_DATATYPE}") +message(VERBOSE "GEMM_STREAMK_LAYOUT: ${GEMM_STREAMK_LAYOUT}") +message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") # Filter GPU targets to only gfx90a, gfx942 set(GEMM_GPU_TARGETS_INDIVIDUAL "") @@ -221,15 +221,15 @@ set(DESIRED_TARGETS "gfx90a;gfx942") # TODO: Add gfx950 when supported foreach(target IN LISTS SUPPORTED_GPU_TARGETS) if(target IN_LIST DESIRED_TARGETS) list(APPEND GEMM_GPU_TARGETS_INDIVIDUAL ${target}) - message(STATUS " Adding GPU target: ${target}") + message(VERBOSE " Adding GPU target: ${target}") endif() endforeach() # Skip build if no matching targets found if(NOT GEMM_GPU_TARGETS_INDIVIDUAL) - message(WARNING "Skipping Tile Engine GEMM build: No supported GPU targets (gfx90a, gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") + message(WARNING "Skipping Tile Engine GEMM build: No supported GPU targets (gfx90a, gfx942) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") else() - message(STATUS "Building individual GEMM targets for GPU targets: ${GEMM_GPU_TARGETS_INDIVIDUAL}") + message(VERBOSE "Building individual GEMM targets for GPU targets: ${GEMM_GPU_TARGETS_INDIVIDUAL}") # Enable parallel compilation optimizations # Set up job pools for better parallel compilation control @@ -244,12 +244,12 @@ else() find_program(CCACHE_PROGRAM ccache) if(CCACHE_PROGRAM) set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) - message(STATUS "Using ccache for faster compilation") + message(VERBOSE "Using ccache for faster compilation") else() message(WARNING "ccache requested but not found") endif() else() - message(STATUS "ccache disabled for GEMM ops (use -DENABLE_CCACHE_GEMM=ON to enable)") 
+ message(VERBOSE "ccache disabled for GEMM ops (use -DENABLE_CCACHE_GEMM=ON to enable)") endif() # Create master collection targets diff --git a/tile_engine/ops/gemm_streamk/configs/default_config.json b/tile_engine/ops/gemm_streamk/configs/default_config.json index 07281bdf9a..96c5571552 100644 --- a/tile_engine/ops/gemm_streamk/configs/default_config.json +++ b/tile_engine/ops/gemm_streamk/configs/default_config.json @@ -98,7 +98,7 @@ }, "reduction_strategy": { "values": [ - "atomic" + "atomic", "linear", "tree" ] } } diff --git a/tile_engine/ops/gemm_streamk/gemm_streamk_benchmark.py b/tile_engine/ops/gemm_streamk/gemm_streamk_benchmark.py new file mode 100644 index 0000000000..ad8d9ff35c --- /dev/null +++ b/tile_engine/ops/gemm_streamk/gemm_streamk_benchmark.py @@ -0,0 +1,676 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +import sys +import json +import subprocess +import argparse +import csv +import time +from pathlib import Path +from typing import List, Dict, Tuple, Optional + + +class GemmBenchmark: + def __init__(self, build_dir: str, verbose: bool = False): + self.build_dir = Path(build_dir) + self.verbose = verbose + self.results = [] + + def discover_kernels(self) -> List[Path]: + """Find all benchmark_gemm_streamk_* executables in the build directory""" + bin_dir = self.build_dir / "bin" + if not bin_dir.exists(): + print(f"Error: Binary directory {bin_dir} does not exist") + return [] + + kernels = list(bin_dir.glob("benchmark_gemm_streamk_*")) + if self.verbose: + print(f"Found {len(kernels)} kernel executables") + for k in kernels: + print(f" - {k.name}") + return kernels + + def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]: + """Extract comprehensive kernel information from filename""" + name = kernel_path.stem + + # Initialize with basic info + info = { + "executable": str(kernel_path), + "name": name, + "data_type": "unknown", + "layout": "unknown", + "pipeline": 
"unknown", + "scheduler": "unknown", + "epilogue": "unknown", + "reduction_strategy": "unknown", + } + + # Parse the kernel name pattern: + # benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16 + parts = name.split("_") + + if len(parts) >= 3: + # Extract data type (4th part after benchmark_gemm_streamk) + info["data_type"] = parts[3] if len(parts) > 3 else "unknown" + + # Extract layout (5th part) + info["layout"] = parts[4] if len(parts) > 4 else "unknown" + + # Extract pipeline (6th part) + info["pipeline"] = parts[5] if len(parts) > 5 else "unknown" + + # Extract epilogue (7th part) + info["epilogue"] = parts[6] if len(parts) > 6 else "unknown" + + # Extract scheduler (8th part) + info["scheduler"] = parts[7] if len(parts) > 7 else "unknown" + + # Extract reduction strategy (9th part) + info["reduction_strategy"] = parts[8] if len(parts) > 8 else "unknown" + + # Extract detailed configuration from the end of the name + config_info = self.parse_detailed_config(name) + info.update(config_info) + + # Generate config ID + info["config_id"] = self.generate_config_id(info) + + return info + + def parse_detailed_config(self, kernel_name: str) -> Dict: + """Parse detailed configuration from kernel name""" + config = { + "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0}, + "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0}, + "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}, + "optimization_flags": { + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + }, + } + + # Split by underscore and look for patterns + parts = kernel_name.split("_") + + # Look for boolean flags (sequence of True/False values) + bool_sequence = [] + for i, part in enumerate(parts): + if part in ["True", "False"]: + bool_sequence.append(part == "True") + # Continue collecting consecutive boolean values + j = i + 1 + while j < len(parts) and parts[j] in ["True", "False"]: + 
bool_sequence.append(parts[j] == "True") + j += 1 + break + + # Assign boolean flags if we found them + # Order: pad_m, pad_n, pad_k, persistent (4 flags total) + if len(bool_sequence) >= 4: + config["optimization_flags"]["pad_m"] = bool_sequence[0] + config["optimization_flags"]["pad_n"] = bool_sequence[1] + config["optimization_flags"]["pad_k"] = bool_sequence[2] + config["optimization_flags"]["persistent"] = bool_sequence[3] + + # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16) + # The pattern is: tile_sizes_warp_config_warp_tile + dimension_groups = [] + for part in parts: + if "x" in part and len(part.split("x")) == 3: + try: + dims = [int(x) for x in part.split("x")] + if all(d > 0 for d in dims): + dimension_groups.append(dims) + except ValueError: + continue + + # Assign dimensions based on order and magnitude + if len(dimension_groups) >= 3: + # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile + sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) + + # Largest dimensions = tile sizes + config["tile_sizes"]["tile_m"] = sorted_groups[0][0] + config["tile_sizes"]["tile_n"] = sorted_groups[0][1] + config["tile_sizes"]["tile_k"] = sorted_groups[0][2] + + # Smallest dimensions = warp config + config["warp_config"]["warp_m"] = sorted_groups[2][0] + config["warp_config"]["warp_n"] = sorted_groups[2][1] + config["warp_config"]["warp_k"] = sorted_groups[2][2] + + # Middle dimensions = warp tile + config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0] + config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1] + config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2] + elif len(dimension_groups) == 2: + # If only 2 groups, assign based on magnitude + sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) + + # Larger = tile sizes + config["tile_sizes"]["tile_m"] = sorted_groups[0][0] + config["tile_sizes"]["tile_n"] = sorted_groups[0][1] + 
config["tile_sizes"]["tile_k"] = sorted_groups[0][2] + + # Smaller = warp config + config["warp_config"]["warp_m"] = sorted_groups[1][0] + config["warp_config"]["warp_n"] = sorted_groups[1][1] + config["warp_config"]["warp_k"] = sorted_groups[1][2] + elif len(dimension_groups) == 1: + # Only one group - assume it's tile sizes + config["tile_sizes"]["tile_m"] = dimension_groups[0][0] + config["tile_sizes"]["tile_n"] = dimension_groups[0][1] + config["tile_sizes"]["tile_k"] = dimension_groups[0][2] + + return config + + def generate_config_id(self, info: Dict) -> str: + """Generate a compact config ID from kernel info""" + # Create a compact identifier + parts = [ + info.get("data_type", "unk"), + info.get("layout", "unk"), + info.get("pipeline", "unk"), + info.get("scheduler", "unk"), + info.get("reduction_strategy", "unk"), + ] + + # Add tile configuration if available + tile_sizes = info.get("tile_sizes", {}) + if tile_sizes.get("tile_m", 0) > 0: + tile_str = ( + f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}" + ) + parts.append(tile_str) + + # Add warp config if available + warp_config = info.get("warp_config", {}) + if warp_config.get("warp_m", 0) > 0: + warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}" + parts.append(warp_str) + + # Add warp tile if available + warp_tile = info.get("warp_tile", {}) + if warp_tile.get("warp_tile_m", 0) > 0: + warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}" + parts.append(warp_tile_str) + + return "_".join(parts) + + def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]: + """Run a single kernel with given parameters and save output to individual JSON file""" + # Create results directory + results_dir = self.build_dir / "results" + results_dir.mkdir(exist_ok=True) + + # Generate unique JSON filename for this kernel + json_file = results_dir / f"{kernel_path.stem}.json" + + cmd = 
[str(kernel_path)] + + # Add parameters + for key, value in params.items(): + cmd.append(f"-{key}={value}") + + # Add JSON output flag for clean JSON output + cmd.append("-json_output=true") + + if self.verbose: + print(f"Running: {' '.join(cmd)}") + + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) + + if result.returncode != 0: + print(f"Error running {kernel_path.name}: {result.stderr}") + return None + + # Save raw output to individual JSON file + output = result.stdout.strip() + if output: + with open(json_file, "w") as f: + f.write(output) + + # Parse the JSON file + return self.parse_json_file(json_file) + else: + print(f"No output from {kernel_path.name}") + return None + + except subprocess.TimeoutExpired: + print(f"Timeout running {kernel_path.name}") + return None + except Exception as e: + print(f"Error running {kernel_path.name}: {e}") + return None + + def parse_json_file(self, json_file: Path) -> Optional[Dict]: + """Parse JSON data from individual kernel output file""" + try: + with open(json_file, "r") as f: + content = f.read().strip() + + # Parse the JSON directly since executables produce clean JSON + data = json.loads(content) + + # Return the complete JSON data as-is, just add some convenience fields + result = data.copy() + if "perf_result" in data: + perf = data["perf_result"] + # Add convenience fields for backward compatibility + result["time_ms"] = perf.get("latency(ms)", 0) + result["tflops"] = perf.get("tflops(TFlops)", 0) + result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0) + + return result + + except json.JSONDecodeError as e: + if self.verbose: + print(f"Failed to parse JSON from {json_file}: {e}") + return None + except Exception as e: + if self.verbose: + print(f"Error reading JSON file {json_file}: {e}") + return None + + def benchmark_problem_size( + self, + kernels: List[Path], + m: int, + n: int, + k: int, + verify: int = 0, + warmup: int = 50, + repeat: int = 100, + flush_cache: bool = 
True, + rotating_count: int = 1000, + ) -> List[Dict]: + """Benchmark all kernels for a specific problem size""" + results = [] + + params = { + "m": m, + "n": n, + "k": k, + "verify": verify, + "warmup": warmup, + "repeat": repeat, + "flush_cache": str(flush_cache).lower(), + "rotating_count": rotating_count, + } + + print(f"\nBenchmarking M={m}, N={n}, K={k}") + + for kernel_path in kernels: + kernel_info = self.extract_kernel_info(kernel_path) + result = self.run_kernel(kernel_path, params) + + if result: + # Create new structured result format + structured_result = { + "name": kernel_info["name"], # Add name field for compatibility + "config_id": kernel_info["config_id"], + "problem": result.get("problem", {}), + "perf_result": result.get("perf_result", {}), + "config": { + "data_type": kernel_info["data_type"], + "layout": kernel_info["layout"], + "pipeline": kernel_info["pipeline"], + "scheduler": kernel_info["scheduler"], + "epilogue": kernel_info["epilogue"], + "reduction_strategy": kernel_info["reduction_strategy"], + "tile_sizes": kernel_info.get("tile_sizes", {}), + "warp_config": kernel_info.get("warp_config", {}), + "warp_tile": kernel_info.get("warp_tile", {}), + "optimization_flags": kernel_info.get("optimization_flags", {}), + }, + "executable": kernel_info["executable"], + # Keep backward compatibility fields + "time_ms": result.get("time_ms", 0), + "tflops": result.get("tflops", 0), + "bandwidth_gb_s": result.get("bandwidth_gb_s", 0), + } + + results.append(structured_result) + + if self.verbose: + print( + f" {kernel_info['config_id']}: {structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms" + ) + + return results + + def find_best_kernel( + self, results: List[Dict], metric: str = "tflops" + ) -> Optional[Dict]: + """Find the best performing kernel based on metric""" + if not results: + return None + + if metric == "tflops": + return max(results, key=lambda x: 
x.get("tflops", 0)) + elif metric == "time_ms": + return min(results, key=lambda x: x.get("time_ms", float("inf"))) + elif metric == "bandwidth_gb_s": + return max(results, key=lambda x: x.get("bandwidth_gb_s", 0)) + else: + raise ValueError(f"Unknown metric: {metric}") + + def benchmark_sweep( + self, + problem_sizes: List[Tuple[int, int, int]], + verify: bool = False, + warmup: int = 50, + repeat: int = 100, + flush_cache: bool = True, + rotating_count: int = 1000, + ) -> Dict: + """Run comprehensive benchmark sweep""" + kernels = self.discover_kernels() + if not kernels: + print("No kernels found!") + return {} + + all_results = [] + best_kernels = {} + + for m, n, k in problem_sizes: + results = self.benchmark_problem_size( + kernels, + m, + n, + k, + verify=2 if verify else 0, + warmup=warmup, + repeat=repeat, + flush_cache=flush_cache, + rotating_count=rotating_count, + ) + + all_results.extend(results) + + # Find best kernel for this configuration + best = self.find_best_kernel(results) + if best: + key = f"m{m}_n{n}_k{k}" + best_kernels[key] = best + print( + f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)" + ) + + self.results = all_results + return best_kernels + + def export_csv(self, filename: str): + """Export all results to CSV""" + if not self.results: + print("No results to export") + return + + # Get all unique keys from results + all_keys = set() + for result in self.results: + all_keys.update(result.keys()) + + # Sort keys for consistent output + fieldnames = sorted(all_keys) + + with open(filename, "w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(self.results) + + print(f"Results exported to {filename}") + + def export_best_kernels(self, best_kernels: Dict, filename: str): + """Export best kernel selections to file""" + with open(filename, "w") as f: + f.write("# Best kernel selections\n") + 
f.write( + "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n" + ) + + for key, kernel in sorted(best_kernels.items()): + f.write( + f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n" + ) + + print(f"Best kernels exported to {filename}") + + def export_json(self, filename: str, best_kernels: Dict = None): + """Export all results and best kernels to JSON with comprehensive metadata""" + from datetime import datetime + + # Calculate comprehensive summary statistics for all metrics + successful_results = [r for r in self.results if r.get("tflops", 0) > 0] + + tflops_values = [r.get("tflops", 0) for r in successful_results] + bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results] + latency_values = [ + r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0 + ] + + # Performance breakdown by kernel type + pipeline_stats = {} + scheduler_stats = {} + data_type_stats = {} + + for result in successful_results: + # Get config info from the new structure + config = result.get("config", {}) + + # Pipeline statistics + pipeline = config.get("pipeline", "unknown") + if pipeline not in pipeline_stats: + pipeline_stats[pipeline] = { + "count": 0, + "avg_tflops": 0, + "best_tflops": 0, + } + pipeline_stats[pipeline]["count"] += 1 + pipeline_stats[pipeline]["best_tflops"] = max( + pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0) + ) + + # Scheduler statistics + scheduler = config.get("scheduler", "unknown") + if scheduler not in scheduler_stats: + scheduler_stats[scheduler] = { + "count": 0, + "avg_tflops": 0, + "best_tflops": 0, + } + scheduler_stats[scheduler]["count"] += 1 + scheduler_stats[scheduler]["best_tflops"] = max( + scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0) + ) + + # Data type statistics + data_type = config.get("data_type", "unknown") + if data_type not in data_type_stats: + 
data_type_stats[data_type] = { + "count": 0, + "avg_tflops": 0, + "best_tflops": 0, + } + data_type_stats[data_type]["count"] += 1 + data_type_stats[data_type]["best_tflops"] = max( + data_type_stats[data_type]["best_tflops"], result.get("tflops", 0) + ) + + # Calculate averages for breakdown stats + for stats_dict, field_name in [ + (pipeline_stats, "pipeline"), + (scheduler_stats, "scheduler"), + (data_type_stats, "data_type"), + ]: + for key in stats_dict: + relevant_results = [ + r + for r in successful_results + if r.get("config", {}).get(field_name, "unknown") == key + ] + if relevant_results: + stats_dict[key]["avg_tflops"] = sum( + r.get("tflops", 0) for r in relevant_results + ) / len(relevant_results) + + output_data = { + "benchmark_metadata": { + "timestamp": datetime.now().isoformat(), + "total_kernels_tested": len(self.results), + "unique_kernels": len( + set(r.get("name", "unknown") for r in self.results) + ), + "successful_runs": len(successful_results), + "failed_runs": len(self.results) - len(successful_results), + }, + "performance_summary": { + "tflops_stats": { + "best": max(tflops_values, default=0), + "average": sum(tflops_values) / len(tflops_values) + if tflops_values + else 0, + "min": min(tflops_values, default=0), + "median": sorted(tflops_values)[len(tflops_values) // 2] + if tflops_values + else 0, + }, + "bandwidth_stats": { + "best_gb_s": max(bandwidth_values, default=0), + "average_gb_s": sum(bandwidth_values) / len(bandwidth_values) + if bandwidth_values + else 0, + "min_gb_s": min(bandwidth_values, default=0), + "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2] + if bandwidth_values + else 0, + }, + "latency_stats": { + "best_ms": min(latency_values, default=0), + "average_ms": sum(latency_values) / len(latency_values) + if latency_values + else 0, + "max_ms": max(latency_values, default=0), + "median_ms": sorted(latency_values)[len(latency_values) // 2] + if latency_values + else 0, + }, + 
"kernel_type_breakdown": { + "by_pipeline": pipeline_stats, + "by_scheduler": scheduler_stats, + "by_data_type": data_type_stats, + }, + "total_problem_configurations": len(best_kernels) + if best_kernels + else 0, + }, + "kernel_results": self.results, + "best_kernels_by_problem": best_kernels or {}, + } + + with open(filename, "w") as f: + json.dump(output_data, f, indent=2) + + print(f"JSON results exported to {filename}") + print(f" - Total kernels: {len(self.results)}") + print(f" - Successful runs: {len(successful_results)}") + print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") + print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") + print(f" - Best latency: {min(latency_values, default=0):.2f}ms") + + +def main(): + parser = argparse.ArgumentParser(description="GEMM Kernel Benchmarking Tool") + parser.add_argument( + "build_dir", help="Build directory containing kernel executables" + ) + parser.add_argument( + "--problem-sizes", + nargs="+", + default=["1024,1024,1024", "2048,2048,2048", "4096,4096,4096"], + help="Problem sizes as M,N,K tuples", + ) + parser.add_argument("--verify", action="store_true", help="Enable verification") + parser.add_argument( + "--csv", default="gemm_benchmark_results.csv", help="CSV output filename" + ) + parser.add_argument( + "--best", default="best_kernels.txt", help="Best kernels output filename" + ) + parser.add_argument("--verbose", action="store_true", help="Verbose output") + parser.add_argument( + "--warmup", + type=int, + default=50, + help="Number of warmup iterations (default: 50)", + ) + parser.add_argument( + "--repeat", + type=int, + default=100, + help="Number of benchmark iterations (default: 100)", + ) + parser.add_argument( + "--no-flush-cache", + dest="flush_cache", + action="store_false", + default=True, + help="Disable cache flushing (default: enabled)", + ) + parser.add_argument( + "--rotating-count", + type=int, + default=1000, + help="Number of iterations to rotate cache 
(default: 1000)", + ) + parser.add_argument("--json", help="JSON output filename (optional)") + + args = parser.parse_args() + + # Parse problem sizes + problem_sizes = [] + for size_str in args.problem_sizes: + try: + m, n, k = map(int, size_str.split(",")) + problem_sizes.append((m, n, k)) + except ValueError: + print(f"Invalid problem size: {size_str}") + return 1 + + # Create benchmark instance + benchmark = GemmBenchmark(args.build_dir, verbose=args.verbose) + + # Run benchmark sweep + print("Starting GEMM kernel benchmark sweep...") + start_time = time.time() + + best_kernels = benchmark.benchmark_sweep( + problem_sizes=problem_sizes, + verify=args.verify, + warmup=args.warmup, + repeat=args.repeat, + flush_cache=args.flush_cache, + rotating_count=args.rotating_count, + ) + + elapsed_time = time.time() - start_time + print(f"\nBenchmark completed in {elapsed_time:.2f} seconds") + + # Export results + benchmark.export_csv(args.csv) + benchmark.export_best_kernels(best_kernels, args.best) + + # Export JSON if requested + if args.json: + benchmark.export_json(args.json, best_kernels) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tile_engine/ops/gemm_streamk/gemm_streamk_instance_builder.py b/tile_engine/ops/gemm_streamk/gemm_streamk_instance_builder.py index 5c87d6f50c..8fd422e6b8 100644 --- a/tile_engine/ops/gemm_streamk/gemm_streamk_instance_builder.py +++ b/tile_engine/ops/gemm_streamk/gemm_streamk_instance_builder.py @@ -436,19 +436,18 @@ struct SelectedKernel {{ static constexpr ck_tile::index_t WarpTileK = {tile_config["warp_tile_k"]}; // Traits - static constexpr bool kPadM = {"true" if pad_m == "true" else "false"}; - static constexpr bool kPadN = {"true" if pad_n == "true" else "false"}; - static constexpr bool kPadK = {"true" if pad_k == "true" else "false"}; + static constexpr bool kPadM = {"true" if str(pad_m).lower() == "true" else "false"}; + static constexpr bool kPadN = {"true" if str(pad_n).lower() == "true" else 
"false"}; + static constexpr bool kPadK = {"true" if str(pad_k).lower() == "true" else "false"}; static constexpr bool Preshuffle = false; - static constexpr bool DoubleSmemBuffer = {"true" if pipeline == "compv4" else "false"}; + static constexpr bool DoubleSmemBuffer = {"true" if str(pipeline).lower() == "compv4" else "false"}; static constexpr int kBlockPerCu = 1; static constexpr bool StructuredSparsity = false; static constexpr bool NumWaveGroup = 1; static constexpr bool TransposeC = false; static constexpr bool UsePersistentKernel = {"true" if str(persistent).lower() == "true" else "false"}; - static constexpr bool UseStructuredSparsity = false; static constexpr ck_tile::index_t NumWaveGroups = 1; static constexpr ck_tile::StreamKReductionStrategy reduction_strategy = {reduction_strategy_map.get(reduction_strategy, "ck_tile::StreamKReductionStrategy::Linear")}; @@ -697,11 +696,11 @@ struct SelectedKernel {{ pipeline, epilogue, scheduler, + reduction_strategy, pad_m, pad_n, pad_k, persistent, - reduction_strategy, ) = trait_combo # Create kernel name with proper boolean capitalization @@ -873,10 +872,10 @@ def main(): trait_parts[1], # epilogue trait_parts[2], # scheduler trait_parts[3], # reduction_strategy - trait_parts[4] == "false", # pad_m - trait_parts[5] == "false", # pad_n - trait_parts[6] == "false", # pad_k - trait_parts[7], # persistent + str(trait_parts[4]).lower() == "true", # pad_m + str(trait_parts[5]).lower() == "true", # pad_n + str(trait_parts[6]).lower() == "true", # pad_k + str(trait_parts[7]).lower() == "true", # persistent ) # Generate the kernel diff --git a/tile_engine/ops/pooling/CMakeLists.txt b/tile_engine/ops/pooling/CMakeLists.txt new file mode 100644 index 0000000000..c7a47f0558 --- /dev/null +++ b/tile_engine/ops/pooling/CMakeLists.txt @@ -0,0 +1,213 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+# SPDX-License-Identifier: MIT + +# ============================================================================ +# Pooling Tile Engine Build Configuration +# +# Generates individual benchmark executables for pooling kernels +# ============================================================================ + +set(POOLING_DATATYPE "fp8;fp16;fp32" CACHE STRING "List of datatypes for Pooling (semicolon-separated)") +set(POOLING_CONFIG_FILE "" CACHE STRING "Custom config file name (without path, must be in configs/ folder)") +option(ENABLE_CCACHE_POOLING "Enable ccache for pooling ops compilation" OFF) + +# Store the directory path for use in functions +set(POOLING_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}) + +# ============================================================================ +# create_individual_pool_target +# +# Creates a single benchmark executable for a specific pooling kernel config. +# ============================================================================ +function(create_individual_pool_target datatype kernel_name trait tile_config config_json) + if(NOT POOLING_GPU_TARGETS) + message(WARNING "Skipping individual pooling target: No supported GPU targets") + return() + endif() + + set(target_name "benchmark_pooling_${datatype}_${trait}_${tile_config}") + set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}") + # HIP clang offload uses temporary files derived from the input source basename. + # When many targets compile the same source filename in parallel, temporary + # files can collide and corrupt each other. Use a unique copied source per target. 
+ set(target_source "${CMAKE_CURRENT_BINARY_DIR}/${target_name}_pooling_benchmark_single.cpp") + + # Generated header path - use kernel_name from pool_kernel_list.txt to match + # the filename generated by pooling_instance_builder.py + set(instance_header "${working_path}/pooling_single_${kernel_name}.hpp") + + # Add custom command to generate the header file at build time + add_custom_command( + OUTPUT ${instance_header} + COMMAND ${Python3_EXECUTABLE} ${POOLING_SOURCE_DIR}/pooling_instance_builder.py + --working_path ${working_path} + --datatype ${datatype} + --config_json ${config_json} + --gen_single + --kernel_name "${kernel_name}" + --tile_config "${tile_config}" + --trait_combo "${trait}" + DEPENDS ${POOLING_SOURCE_DIR}/pooling_instance_builder.py ${config_json} + COMMENT "Generating ${instance_header}" + ) + + configure_file(${POOLING_SOURCE_DIR}/pooling_benchmark_single.cpp ${target_source} COPYONLY) + + # Create the executable + add_executable(${target_name} + EXCLUDE_FROM_ALL + ${target_source} + ${instance_header} + ) + + # Set GPU architectures + set_property(TARGET ${target_name} PROPERTY HIP_ARCHITECTURES ${POOLING_GPU_TARGETS}) + + # Set compile definitions + target_compile_definitions(${target_name} PRIVATE + POOLING_SINGLE_INSTANCE_HPP="${instance_header}" + ) + + # Include directories + target_include_directories(${target_name} PRIVATE + ${POOLING_SOURCE_DIR} + ${working_path} + ) + + # Compile options + target_compile_options(${target_name} PRIVATE + -Wno-undefined-func-template + -Wno-float-equal + --offload-compress + -include ${instance_header} + ) + + # Add FP8 format definitions if needed + if(CK_USE_OCP_FP8) + target_compile_options(${target_name} PRIVATE -DCK_TILE_USE_OCP_FP8) + endif() + + # Add to collection targets + add_dependencies(benchmark_pooling_all ${target_name}) + add_dependencies(benchmark_pooling_${datatype} ${target_name}) + + message(DEBUG " Created pooling benchmark target: ${target_name}") +endfunction() + +# 
============================================================================ +# build_individual_pool_targets +# +# Builds all benchmark targets for a specific datatype. +# ============================================================================ +function(build_individual_pool_targets datatype) + set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}") + + # Choose config file + if(DEFINED ENV{POOLING_CONFIG_FILE} AND NOT "$ENV{POOLING_CONFIG_FILE}" STREQUAL "") + set(config_filename "$ENV{POOLING_CONFIG_FILE}") + set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${config_filename}") + message(VERBOSE " Using config from environment variable: ${config_filename}") + elseif(NOT "${POOLING_CONFIG_FILE}" STREQUAL "") + set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${POOLING_CONFIG_FILE}") + message(VERBOSE " Using custom config: ${POOLING_CONFIG_FILE}") + else() + set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json") + message(VERBOSE " Using default config for pooling") + endif() + + if(NOT EXISTS ${json_blob}) + message(FATAL_ERROR "Config file not found: ${json_blob}") + endif() + + file(MAKE_DIRECTORY ${working_path}) + + # Step 1: List kernels + message(VERBOSE " Listing pooling kernel configurations for ${datatype}...") + execute_process( + COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/pooling_instance_builder.py + --working_path ${working_path} + --datatype ${datatype} + --config_json ${json_blob} + --list_kernels + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + RESULT_VARIABLE ret + OUTPUT_VARIABLE list_output + ERROR_VARIABLE list_error + ) + + if(NOT ret EQUAL 0) + message(FATAL_ERROR "Failed to list pooling kernels for ${datatype}: ${list_error}") + endif() + + # Read kernel count + if(EXISTS ${working_path}/pool_kernel_count.txt) + file(READ ${working_path}/pool_kernel_count.txt kernel_count) + string(STRIP "${kernel_count}" kernel_count) + message(VERBOSE " Found ${kernel_count} pooling kernel configurations") + else() 
+ message(FATAL_ERROR "Pooling kernel count file not found") + endif() + + # Step 2: Create targets + if(EXISTS ${working_path}/pool_kernel_list.txt) + file(STRINGS ${working_path}/pool_kernel_list.txt kernel_lines) + foreach(line IN LISTS kernel_lines) + string(REPLACE "|" ";" parts "${line}") + list(LENGTH parts parts_len) + if(parts_len EQUAL 3) + list(GET parts 0 kernel_name) + list(GET parts 1 tile_config) + list(GET parts 2 trait_combo) + create_individual_pool_target("${datatype}" "${kernel_name}" "${trait_combo}" "${tile_config}" "${json_blob}") + endif() + endforeach() + else() + message(FATAL_ERROR "Pooling kernel list file not found") + endif() +endfunction() + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +message(VERBOSE "=== Starting Tile Engine Pooling Configuration ===") +message(VERBOSE "POOLING_DATATYPE: ${POOLING_DATATYPE}") +message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") + +# Filter GPU targets +set(POOLING_GPU_TARGETS "") +set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201") + +foreach(target IN LISTS SUPPORTED_GPU_TARGETS) + if(target IN_LIST DESIRED_TARGETS) + list(APPEND POOLING_GPU_TARGETS ${target}) + message(VERBOSE " Adding GPU target for pooling: ${target}") + endif() +endforeach() + +if(NOT POOLING_GPU_TARGETS) + message(WARNING "Skipping Tile Engine Pooling build: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") +else() + message(VERBOSE "Building pooling targets for GPU targets: ${POOLING_GPU_TARGETS}") + + # Enable ccache if requested + if(ENABLE_CCACHE_POOLING) + find_program(CCACHE_PROGRAM ccache) + if(CCACHE_PROGRAM) + set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) + message(VERBOSE "Using ccache for pooling compilation") + endif() + endif() + + # Create collection targets + 
add_custom_target(benchmark_pooling_all) + + foreach(dt IN LISTS POOLING_DATATYPE) + add_custom_target(benchmark_pooling_${dt}) + endforeach() + + # Build targets for each datatype + foreach(dt IN LISTS POOLING_DATATYPE) + build_individual_pool_targets(${dt}) + endforeach() +endif() diff --git a/tile_engine/ops/pooling/configs/default_config.json b/tile_engine/ops/pooling/configs/default_config.json new file mode 100644 index 0000000000..0104dbd9f7 --- /dev/null +++ b/tile_engine/ops/pooling/configs/default_config.json @@ -0,0 +1,21 @@ +{ + "problem": { + "description": "Default pooling configuration for tile_engine benchmarks" + }, + "tile_config": { + "block_m": {"values": [64,128,256]}, + "block_n": {"values": [1,2]}, + "warp_m": {"values": [1]}, + "warp_n": {"values": [1]}, + "warp_tile_m": {"values": [128]}, + "warp_tile_n": {"values": [1]}, + "thread_tile_m": {"values": [1,2,4]}, + "thread_tile_n": {"values": [1]} + }, + "trait_config": { + "reduce_op": {"values": ["max", "min", "avg"]}, + "output_index": {"values": [true, false]}, + "propagate_nan": {"values": [true, false]}, + "pooling_dim": {"values": ["2d", "3d"]} + } +} \ No newline at end of file diff --git a/tile_engine/ops/pooling/pooling_benchmark.hpp b/tile_engine/ops/pooling/pooling_benchmark.hpp new file mode 100644 index 0000000000..09073cdbd1 --- /dev/null +++ b/tile_engine/ops/pooling/pooling_benchmark.hpp @@ -0,0 +1,132 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/pooling.hpp" +#include "ck_tile/host/reference/reference_pool.hpp" + +namespace ck_tile { + +/// @brief Performance metrics for benchmarking +enum class PoolMetric +{ + LATENCY, + BANDWIDTH +}; + +/// @brief Pooling problem specification for 2D pooling +struct PoolProblem2D +{ + index_t N, H, W, C; // Input dimensions (NHWC) + index_t Y, X; // Window dimensions + index_t stride_h, stride_w; // Window strides + index_t dilation_h, dilation_w; // Window dilations + index_t pad_h_left, pad_h_right; // Height padding + index_t pad_w_left, pad_w_right; // Width padding + std::string datatype; // Data type name + std::string reduce_op; // "max", "min", or "avg" + + index_t Ho() const + { + index_t Ys = (Y - 1) * dilation_h + 1; + return (H + pad_h_left + pad_h_right - Ys) / stride_h + 1; + } + + index_t Wo() const + { + index_t Xs = (X - 1) * dilation_w + 1; + return (W + pad_w_left + pad_w_right - Xs) / stride_w + 1; + } + + index_t input_elements() const { return N * H * W * C; } + index_t output_elements() const { return N * Ho() * Wo() * C; } + + std::string to_string() const + { + std::ostringstream oss; + oss << "N" << N << "_H" << H << "_W" << W << "_C" << C << "_Y" << Y << "_X" << X << "_Sh" + << stride_h << "_Sw" << stride_w << "_Dh" << dilation_h << "_Dw" << dilation_w; + if(pad_h_left > 0 || pad_w_left > 0) + oss << "_Ph" << pad_h_left << "_Pw" << pad_w_left; + return oss.str(); + } +}; + +/// @brief Pooling problem specification for 3D pooling +struct PoolProblem3D +{ + index_t N, D, H, W, C; // Input dimensions (NDHWC) + index_t Z, Y, X; // Window dimensions + index_t stride_d, stride_h, stride_w; // Window strides + index_t dilation_d, dilation_h, dilation_w; // Window dilations + index_t pad_d_left, pad_d_right; // Depth padding + index_t pad_h_left, 
pad_h_right; // Height padding + index_t pad_w_left, pad_w_right; // Width padding + std::string datatype; // Data type name + std::string reduce_op; // "max", "min", or "avg" + + index_t Do() const + { + index_t Zs = (Z - 1) * dilation_d + 1; + return (D + pad_d_left + pad_d_right - Zs) / stride_d + 1; + } + + index_t Ho() const + { + index_t Ys = (Y - 1) * dilation_h + 1; + return (H + pad_h_left + pad_h_right - Ys) / stride_h + 1; + } + + index_t Wo() const + { + index_t Xs = (X - 1) * dilation_w + 1; + return (W + pad_w_left + pad_w_right - Xs) / stride_w + 1; + } + + index_t input_elements() const { return N * D * H * W * C; } + index_t output_elements() const { return N * Do() * Ho() * Wo() * C; } + + std::string to_string() const + { + std::ostringstream oss; + oss << "N" << N << "_D" << D << "_H" << H << "_W" << W << "_C" << C << "_Z" << Z << "_Y" + << Y << "_X" << X; + return oss.str(); + } +}; + +/// @brief Performance result for a pooling kernel +struct PoolPerformanceResult +{ + float latency_ms; + float bandwidth_gb_s; + + std::string to_string() const + { + std::ostringstream oss; + oss << "latency=" << latency_ms << "ms, bandwidth=" << bandwidth_gb_s << "GB/s"; + return oss.str(); + } +}; + +/// @brief Benchmark settings +struct PoolBenchmarkSetting +{ + int warmup = 5; + int repeat = 20; + bool verify = true; + int init_method = 0; // 0: uniform random, 1: integer sequence, 2: constant, 3: special +}; + +} // namespace ck_tile diff --git a/tile_engine/ops/pooling/pooling_benchmark_single.cpp b/tile_engine/ops/pooling/pooling_benchmark_single.cpp new file mode 100644 index 0000000000..0d872a9f51 --- /dev/null +++ b/tile_engine/ops/pooling/pooling_benchmark_single.cpp @@ -0,0 +1,390 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/** + * @file pooling_benchmark_single.cpp + * @brief Single-kernel benchmark for pooling operations (2D and 3D). 
+ * + * This benchmark includes the generated kernel header via -include flag + * and runs the pooling kernel with specified problem sizes. + * + * The generated header provides: + * - SelectedKernel (struct with ::launch()) + * - KERNEL_NAME (constexpr const char*) + * - POOLING_DIM (constexpr int, 2 or 3) + * - InDataType, OutDataType, ComputeDataType, IndexDataType, ReduceOpType + * - TensorShape, WindowShape + */ + +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/pooling.hpp" +#include "ck_tile/host/reference/reference_pool.hpp" +#include "pooling_common.hpp" +#include "pooling_benchmark.hpp" + +// The kernel header is included via compile command line with -include flag. + +// -------------------------------------------------------------------------- +// Benchmark implementation — templated on pooling dimension so that only +// the matching branch is instantiated (2D or 3D). +// -------------------------------------------------------------------------- + +template +static float launch_selected_kernel(HostArgs& args, const ck_tile::stream_config& stream) +{ + return SelectedKernel::launch(args, stream); +} + +template +static int benchmark_pooling(int argc, char* argv[]) +{ + if constexpr(PoolDim == 2) + { + // ---- 2D argument parser ---- + ck_tile::ArgParser arg_parser; + arg_parser.insert("n", "1", "Batch size (N)") + .insert("h", "16", "Input height (H)") + .insert("w", "16", "Input width (W)") + .insert("c", "32", "Channels (C)") + .insert("wy", "2", "Window height (Y)") + .insert("wx", "2", "Window width (X)") + .insert("sy", "2", "Window stride height") + .insert("sx", "2", "Window stride width") + .insert("dy", "1", "Window dilation height") + .insert("dx", "1", "Window dilation width") + .insert("phy", "0", "Padding height left") + .insert("phyr", "0", "Padding height right") + .insert("pwx", "0", "Padding width left") + .insert("pwxr", "0", "Padding width right") + .insert("verify", "1", 
"Verify results (0/1)") + .insert("warmup", "5", "Warmup iterations") + .insert("repeat", "20", "Repeat iterations") + .insert("log", "1", "Log level"); + + if(!arg_parser.parse(argc, argv)) + return -1; + + ck_tile::index_t N = arg_parser.get_int("n"); + ck_tile::index_t H = arg_parser.get_int("h"); + ck_tile::index_t W = arg_parser.get_int("w"); + ck_tile::index_t C = arg_parser.get_int("c"); + ck_tile::index_t Y = arg_parser.get_int("wy"); + ck_tile::index_t X = arg_parser.get_int("wx"); + ck_tile::index_t Sy = arg_parser.get_int("sy"); + ck_tile::index_t Sx = arg_parser.get_int("sx"); + ck_tile::index_t Dy = arg_parser.get_int("dy"); + ck_tile::index_t Dx = arg_parser.get_int("dx"); + ck_tile::index_t LeftPy = arg_parser.get_int("phy"); + ck_tile::index_t RightPy = arg_parser.get_int("phyr"); + ck_tile::index_t LeftPx = arg_parser.get_int("pwx"); + ck_tile::index_t RightPx = arg_parser.get_int("pwxr"); + + bool verify = arg_parser.get_int("verify") != 0; + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + int log_level = arg_parser.get_int("log"); + + ck_tile::index_t Ys = (Y - 1) * Dy + 1; + ck_tile::index_t Xs = (X - 1) * Dx + 1; + ck_tile::index_t Ho = (H + LeftPy + RightPy - Ys) / Sy + 1; + ck_tile::index_t Wo = (W + LeftPx + RightPx - Xs) / Sx + 1; + + std::cout << "Pooling 2D benchmark: " << KERNEL_NAME << std::endl; + std::cout << " Input: NHWC = " << N << "x" << H << "x" << W << "x" << C << std::endl; + std::cout << " Output: NHWC = " << N << "x" << Ho << "x" << Wo << "x" << C << std::endl; + std::cout << " Window: " << Y << "x" << X << ", stride: " << Sy << "x" << Sx + << ", dilation: " << Dy << "x" << Dx << std::endl; + + ck_tile::HostTensor h_in({N, H, W, C}); + ck_tile::HostTensor h_out({N, Ho, Wo, C}); + ck_tile::HostTensor h_out_ref({N, Ho, Wo, C}); + ck_tile::HostTensor h_out_index({N, Ho, Wo, C}); + ck_tile::HostTensor h_out_ref_index({N, Ho, Wo, C}); + + ck_tile::FillUniformDistribution{-5.f, 5.f}(h_in); 
+ + ck_tile::DeviceMem d_in(h_in.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_out(h_out.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_out_index(h_out_index.get_element_space_size_in_bytes()); + + d_in.ToDevice(h_in.data()); + d_out.SetZero(); + d_out_index.SetZero(); + + auto input_shape = ck_tile::make_tuple(N, H, W, C); + auto output_shape = ck_tile::make_tuple(N, Ho, Wo, C); + auto input_strides = ck_tile::make_tuple(H * W * C, W * C, C, ck_tile::index_t{1}); + auto output_strides = ck_tile::make_tuple(Ho * Wo * C, Wo * C, C, ck_tile::index_t{1}); + auto window_lengths = ck_tile::make_tuple(Y, X); + auto window_strides = ck_tile::make_tuple(Sy, Sx); + auto window_dilations = ck_tile::make_tuple(Dy, Dx); + auto input_left_pads = ck_tile::make_tuple(LeftPy, LeftPx); + auto input_right_pads = ck_tile::make_tuple(RightPy, RightPx); + + auto host_args = ck_tile::PoolHostArgs{ + d_in.GetDeviceBuffer(), + d_out.GetDeviceBuffer(), + d_out_index.GetDeviceBuffer(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + ck_tile::stream_config stream{nullptr, true, log_level, warmup, repeat}; + + float latency = 0; + try + { + latency = launch_selected_kernel(host_args, stream); + } + catch(const std::exception& e) + { + std::cerr << "Kernel launch failed: " << e.what() << std::endl; + return -1; + } + + size_t bytes_read = static_cast(N) * H * W * C * sizeof(InDataType); + size_t bytes_written = static_cast(N) * Ho * Wo * C * sizeof(OutDataType); + float bandwidth = (bytes_read + bytes_written) / (latency * 1e-3f) / 1e9f; + + std::cout << " Latency: " << latency << " ms" << std::endl; + std::cout << " Bandwidth: " << bandwidth << " GB/s" << std::endl; + + if(verify) + { + d_out.FromDevice(h_out.data()); + d_out_index.FromDevice(h_out_index.data()); + + auto kernel_args = + ck_tile::PoolKernelArgs{ + h_in.data(), + h_out_ref.data(), + 
h_out_ref_index.data(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + ck_tile::reference_pool2d( + h_in, h_out_ref, h_out_ref_index, kernel_args, ReduceOpType{}); + + bool pass_value = + ck_tile::check_err(h_out, h_out_ref, "Error: Incorrect values!", 1e-3, 1e-3); + std::cout << " Verification: " << (pass_value ? "PASS" : "FAIL") << std::endl; + + if(SelectedKernel::kOutputIndex) + { + bool pass_index = ck_tile::check_err( + h_out_index, h_out_ref_index, "Error: Incorrect indices!", 0, 0); + std::cout << " Index verification: " << (pass_index ? "PASS" : "FAIL") + << std::endl; + } + } + + return 0; + } + else // PoolDim == 3 + { + // ---- 3D argument parser ---- + ck_tile::ArgParser arg_parser; + arg_parser.insert("n", "1", "Batch size (N)") + .insert("d", "4", "Input depth (D)") + .insert("h", "16", "Input height (H)") + .insert("w", "16", "Input width (W)") + .insert("c", "32", "Channels (C)") + .insert("wz", "2", "Window depth (Z)") + .insert("wy", "2", "Window height (Y)") + .insert("wx", "2", "Window width (X)") + .insert("sz", "2", "Window stride depth") + .insert("sy", "2", "Window stride height") + .insert("sx", "2", "Window stride width") + .insert("dz", "1", "Window dilation depth") + .insert("dy", "1", "Window dilation height") + .insert("dx", "1", "Window dilation width") + .insert("pdz", "0", "Padding depth left") + .insert("pdzr", "0", "Padding depth right") + .insert("phy", "0", "Padding height left") + .insert("phyr", "0", "Padding height right") + .insert("pwx", "0", "Padding width left") + .insert("pwxr", "0", "Padding width right") + .insert("verify", "1", "Verify results (0/1)") + .insert("warmup", "5", "Warmup iterations") + .insert("repeat", "20", "Repeat iterations") + .insert("log", "1", "Log level"); + + if(!arg_parser.parse(argc, argv)) + return -1; + + ck_tile::index_t N = arg_parser.get_int("n"); + ck_tile::index_t D = 
arg_parser.get_int("d"); + ck_tile::index_t H = arg_parser.get_int("h"); + ck_tile::index_t W = arg_parser.get_int("w"); + ck_tile::index_t C = arg_parser.get_int("c"); + ck_tile::index_t Z = arg_parser.get_int("wz"); + ck_tile::index_t Y = arg_parser.get_int("wy"); + ck_tile::index_t X = arg_parser.get_int("wx"); + ck_tile::index_t Sz = arg_parser.get_int("sz"); + ck_tile::index_t Sy = arg_parser.get_int("sy"); + ck_tile::index_t Sx = arg_parser.get_int("sx"); + ck_tile::index_t Dz = arg_parser.get_int("dz"); + ck_tile::index_t Dy = arg_parser.get_int("dy"); + ck_tile::index_t Dx = arg_parser.get_int("dx"); + ck_tile::index_t LeftPz = arg_parser.get_int("pdz"); + ck_tile::index_t RightPz = arg_parser.get_int("pdzr"); + ck_tile::index_t LeftPy = arg_parser.get_int("phy"); + ck_tile::index_t RightPy = arg_parser.get_int("phyr"); + ck_tile::index_t LeftPx = arg_parser.get_int("pwx"); + ck_tile::index_t RightPx = arg_parser.get_int("pwxr"); + + bool verify = arg_parser.get_int("verify") != 0; + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + int log_level = arg_parser.get_int("log"); + + ck_tile::index_t Zs = (Z - 1) * Dz + 1; + ck_tile::index_t Ys = (Y - 1) * Dy + 1; + ck_tile::index_t Xs = (X - 1) * Dx + 1; + ck_tile::index_t Do = (D + LeftPz + RightPz - Zs) / Sz + 1; + ck_tile::index_t Ho = (H + LeftPy + RightPy - Ys) / Sy + 1; + ck_tile::index_t Wo = (W + LeftPx + RightPx - Xs) / Sx + 1; + + std::cout << "Pooling 3D benchmark: " << KERNEL_NAME << std::endl; + std::cout << " Input: NDHWC = " << N << "x" << D << "x" << H << "x" << W << "x" << C + << std::endl; + std::cout << " Output: NDHWC = " << N << "x" << Do << "x" << Ho << "x" << Wo << "x" << C + << std::endl; + std::cout << " Window: " << Z << "x" << Y << "x" << X << ", stride: " << Sz << "x" << Sy + << "x" << Sx << ", dilation: " << Dz << "x" << Dy << "x" << Dx << std::endl; + + ck_tile::HostTensor h_in({N, D, H, W, C}); + ck_tile::HostTensor h_out({N, Do, Ho, Wo, 
C}); + ck_tile::HostTensor h_out_ref({N, Do, Ho, Wo, C}); + ck_tile::HostTensor h_out_index({N, Do, Ho, Wo, C}); + ck_tile::HostTensor h_out_ref_index({N, Do, Ho, Wo, C}); + + ck_tile::FillUniformDistribution{-5.f, 5.f}(h_in); + + ck_tile::DeviceMem d_in(h_in.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_out(h_out.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_out_index(h_out_index.get_element_space_size_in_bytes()); + + d_in.ToDevice(h_in.data()); + d_out.SetZero(); + d_out_index.SetZero(); + + auto input_shape = ck_tile::make_tuple(N, D, H, W, C); + auto output_shape = ck_tile::make_tuple(N, Do, Ho, Wo, C); + auto input_strides = + ck_tile::make_tuple(D * H * W * C, H * W * C, W * C, C, ck_tile::index_t{1}); + auto output_strides = + ck_tile::make_tuple(Do * Ho * Wo * C, Ho * Wo * C, Wo * C, C, ck_tile::index_t{1}); + auto window_lengths = ck_tile::make_tuple(Z, Y, X); + auto window_strides = ck_tile::make_tuple(Sz, Sy, Sx); + auto window_dilations = ck_tile::make_tuple(Dz, Dy, Dx); + auto input_left_pads = ck_tile::make_tuple(LeftPz, LeftPy, LeftPx); + auto input_right_pads = ck_tile::make_tuple(RightPz, RightPy, RightPx); + + auto host_args = ck_tile::PoolHostArgs{ + d_in.GetDeviceBuffer(), + d_out.GetDeviceBuffer(), + d_out_index.GetDeviceBuffer(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + ck_tile::stream_config stream{nullptr, true, log_level, warmup, repeat}; + + float latency = 0; + try + { + latency = launch_selected_kernel(host_args, stream); + } + catch(const std::exception& e) + { + std::cerr << "Kernel launch failed: " << e.what() << std::endl; + return -1; + } + + size_t bytes_read = static_cast(N) * D * H * W * C * sizeof(InDataType); + size_t bytes_written = static_cast(N) * Do * Ho * Wo * C * sizeof(OutDataType); + float bandwidth = (bytes_read + bytes_written) / (latency * 1e-3f) / 1e9f; + + std::cout 
<< " Latency: " << latency << " ms" << std::endl; + std::cout << " Bandwidth: " << bandwidth << " GB/s" << std::endl; + + if(verify) + { + d_out.FromDevice(h_out.data()); + d_out_index.FromDevice(h_out_index.data()); + + auto kernel_args = + ck_tile::PoolKernelArgs{ + h_in.data(), + h_out_ref.data(), + h_out_ref_index.data(), + input_shape, + output_shape, + input_strides, + output_strides, + window_lengths, + window_strides, + window_dilations, + input_left_pads, + input_right_pads}; + + ck_tile::reference_pool3d( + h_in, h_out_ref, h_out_ref_index, kernel_args, ReduceOpType{}); + + bool pass_value = + ck_tile::check_err(h_out, h_out_ref, "Error: Incorrect values!", 1e-3, 1e-3); + std::cout << " Verification: " << (pass_value ? "PASS" : "FAIL") << std::endl; + + if(SelectedKernel::kOutputIndex) + { + bool pass_index = ck_tile::check_err( + h_out_index, h_out_ref_index, "Error: Incorrect indices!", 0, 0); + std::cout << " Index verification: " << (pass_index ? "PASS" : "FAIL") + << std::endl; + } + } + + return 0; + } +} + +int main(int argc, char* argv[]) { return benchmark_pooling(argc, argv); } diff --git a/tile_engine/ops/pooling/pooling_common.hpp b/tile_engine/ops/pooling/pooling_common.hpp new file mode 100644 index 0000000000..313fbac332 --- /dev/null +++ b/tile_engine/ops/pooling/pooling_common.hpp @@ -0,0 +1,52 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/pooling.hpp" + +namespace ck_tile { + +/// @brief Kernel trait parameters for pooling tile_engine configurations +struct PoolingKernelTraits +{ + std::string reduce_op; // "max", "min", or "avg" + bool output_index; // Whether to output indices (max pooling) + bool propagate_nan; // Whether to propagate NaN values + bool cross_warp; // Whether cross-warp reduction is used + + std::string to_string() const + { + std::ostringstream oss; + oss << reduce_op << "_" << (output_index ? "idx" : "noidx") << "_" + << (propagate_nan ? "nan" : "nonan") << "_" + << (cross_warp ? "crosswarp" : "nocrosswarp"); + return oss.str(); + } +}; + +/// @brief Extract traits from a kernel name string +inline PoolingKernelTraits extract_pooling_traits_from_name(const std::string& name) +{ + PoolingKernelTraits traits; + if(name.find("max") != std::string::npos) + traits.reduce_op = "max"; + else if(name.find("min") != std::string::npos) + traits.reduce_op = "min"; + else + traits.reduce_op = "avg"; + traits.output_index = + (name.find("idx") != std::string::npos) && (name.find("noidx") == std::string::npos); + traits.propagate_nan = + (name.find("nan") != std::string::npos) && (name.find("nonan") == std::string::npos); + traits.cross_warp = (name.find("crosswarp") != std::string::npos) && + (name.find("nocrosswarp") == std::string::npos); + return traits; +} + +} // namespace ck_tile diff --git a/tile_engine/ops/pooling/pooling_instance_builder.py b/tile_engine/ops/pooling/pooling_instance_builder.py new file mode 100644 index 0000000000..0495ee3348 --- /dev/null +++ b/tile_engine/ops/pooling/pooling_instance_builder.py @@ -0,0 +1,551 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Pooling kernel instance builder for tile_engine. 
+ +Generates C++ kernel headers for pooling operations with specific tile +configurations and trait combinations. + +Usage: + --list_kernels: List valid kernel configurations + --gen_single: Generate a single kernel header + --gen_individual: Generate all kernel headers +""" + +import os +import json +import argparse +import itertools +import multiprocessing +import concurrent.futures +from pathlib import Path +import logging + +from pooling_validation_utils import ( + is_tile_config_valid, + is_trait_combination_valid, + get_dtype_string, + get_reduce_op_string, +) + +logger = logging.getLogger(__name__) + + +class PoolingKernelBuilder: + def __init__(self, working_path, datatype, config_json=None): + self.working_path = Path(working_path) + self.datatype = datatype + self.config_json = config_json + + # Create working directory if it doesn't exist + self.working_path.mkdir(parents=True, exist_ok=True) + + # Load configuration + if config_json and os.path.exists(config_json): + with open(config_json, "r") as f: + self.config = json.load(f) + else: + self.config = self._get_default_config() + + def _get_default_config(self): + """Return default configuration if no config file is provided""" + return { + "tile_config": { + "block_m": {"values": [64,128,256]}, + "block_n": {"values": [1,2]}, + "warp_m": {"values": [1]}, + "warp_n": {"values": [1]}, + "warp_tile_m": {"values": [128]}, + "warp_tile_n": {"values": [1]}, + "thread_tile_m": {"values": [1,2,4]}, + "thread_tile_n": {"values": [1]}, + }, + "trait_config": { + "reduce_op": {"values": ["max", "min", "avg"]}, + "output_index": {"values": [True, False]}, + "propagate_nan": {"values": [True, False]}, + "pooling_dim": {"values": ["2d", "3d"]}, + }, + } + + def _get_tile_configs(self, fast_mode=False): + """Get tile configurations from config""" + if "tile_config" not in self.config: + return [] + + tile_config = self.config["tile_config"] + + block_m_values = tile_config.get("block_m", {}).get("values", 
[64,128,256]) + block_n_values = tile_config.get("block_n", {}).get("values", [1,2]) + warp_m_values = tile_config.get("warp_m", {}).get("values", [1]) + warp_n_values = tile_config.get("warp_n", {}).get("values", [1]) + warp_tile_m_values = tile_config.get("warp_tile_m", {}).get("values", [128]) + warp_tile_n_values = tile_config.get("warp_tile_n", {}).get("values", [1]) + thread_tile_m_values = tile_config.get("thread_tile_m", {}).get("values", [1,2,4]) + thread_tile_n_values = tile_config.get("thread_tile_n", {}).get("values", [1]) + + configs = [] + for block_m in block_m_values: + for block_n in block_n_values: + for warp_m in warp_m_values: + for warp_n in warp_n_values: + for warp_tile_m in warp_tile_m_values: + for warp_tile_n in warp_tile_n_values: + for thread_tile_m in thread_tile_m_values: + for thread_tile_n in thread_tile_n_values: + if self._validate_tile_config( + block_m, + block_n, + warp_m, + warp_n, + warp_tile_m, + warp_tile_n, + thread_tile_m, + thread_tile_n, + fast_mode=fast_mode, + ): + configs.append( + { + "block_m": block_m, + "block_n": block_n, + "warp_m": warp_m, + "warp_n": warp_n, + "warp_tile_m": warp_tile_m, + "warp_tile_n": warp_tile_n, + "thread_tile_m": thread_tile_m, + "thread_tile_n": thread_tile_n, + } + ) + return configs + + def _validate_tile_config( + self, + block_m, + block_n, + warp_m, + warp_n, + warp_tile_m, + warp_tile_n, + thread_tile_m, + thread_tile_n, + fast_mode=False, + ): + """Validate tile configuration via pooling_validation_utils.""" + return is_tile_config_valid( + block_m, + block_n, + warp_m, + warp_n, + warp_tile_m, + warp_tile_n, + thread_tile_m, + thread_tile_n, + self.datatype, + self.datatype, + fast_mode=fast_mode, + ) + + def _generate_trait_combinations(self): + """Generate all combinations of traits""" + if "trait_config" not in self.config: + return [("max", True, False, "2d")] + + trait_config = self.config["trait_config"] + + reduce_ops = trait_config.get("reduce_op", {}).get("values", 
["min","max","avg"]) + output_indices = trait_config.get("output_index", {}).get("values", [True, False]) + propagate_nans = trait_config.get("propagate_nan", {}).get("values", [True, False]) + pooling_dims = trait_config.get("pooling_dim", {}).get("values", ["2d", "3d"]) + + all_combinations = list( + itertools.product(reduce_ops, output_indices, propagate_nans, pooling_dims) + ) + + # Filter valid combinations + combinations = [] + for combo in all_combinations: + reduce_op, output_index, propagate_nan, pooling_dim = combo + if is_trait_combination_valid( + reduce_op, output_index, propagate_nan, pooling_dim + ): + combinations.append(combo) + else: + logger.debug( + f"Skipping unsupported trait combination: {reduce_op}-{output_index}-{propagate_nan}-{pooling_dim}" + ) + + return combinations + + def _get_dtype_string(self): + """Get C++ type string for datatype.""" + return get_dtype_string(self.datatype) + + def _get_reduce_op_string(self, reduce_op): + """Get C++ reduce op type string.""" + return get_reduce_op_string(reduce_op) + + def _generate_kernel_instance(self, tile_config, trait_combo, is_header=True): + """Generate a single kernel instance header""" + reduce_op, output_index, propagate_nan, pooling_dim = trait_combo + + # Create kernel name + kernel_name = ( + f"pool_{self.datatype}_{pooling_dim}_{reduce_op}_" + f"{'idx' if output_index else 'noidx'}_" + f"{'nan' if propagate_nan else 'nonan'}" + ) + + # Create tile configuration string + tile_str = ( + f"{tile_config['block_m']}x{tile_config['block_n']}_" + f"{tile_config['warp_m']}x{tile_config['warp_n']}_" + f"{tile_config['warp_tile_m']}x{tile_config['warp_tile_n']}_" + f"{tile_config['thread_tile_m']}x{tile_config['thread_tile_n']}" + ) + + kernel_name += f"_{tile_str}" + + # Determine types + in_type = self._get_dtype_string() + out_type = in_type + compute_type = "float" # Always use float for computation + index_type = "ck_tile::index_t" + reduce_op_type = self._get_reduce_op_string(reduce_op) 
+ + output_index_str = "true" if output_index else "false" + propagate_nan_str = "true" if propagate_nan else "false" + + # Generate 2D or 3D specific code + if pooling_dim == "2d": + tensor_shape_type = "ck_tile::tuple" + window_shape_type = "ck_tile::tuple" + window_rank = 2 + else: + tensor_shape_type = "ck_tile::tuple" + window_shape_type = ( + "ck_tile::tuple" + ) + window_rank = 3 + + pragma_line = "#pragma once\n" if is_header else "" + instance_code = f"""// Generated kernel instance for {kernel_name} +{pragma_line} +#include +#include +#include +#include +#include +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/pooling.hpp" + +using InDataType = {in_type}; +using OutDataType = {out_type}; +using ComputeDataType = {compute_type}; +using IndexDataType = {index_type}; +using ReduceOpType = {reduce_op_type}; + +using TensorShape = {tensor_shape_type}; +using WindowShape = {window_shape_type}; + +// Kernel name for display +constexpr const char* KERNEL_NAME = "{kernel_name}"; +constexpr int POOLING_DIM = {window_rank}; + +// Wrapper for simplified launch interface +struct SelectedKernel {{ + // Tile configuration - PoolShape parameters + static constexpr ck_tile::index_t Block_M = {tile_config["block_m"]}; + static constexpr ck_tile::index_t Block_N = {tile_config["block_n"]}; + static constexpr ck_tile::index_t WarpPerBlock_M = {tile_config["warp_m"]}; + static constexpr ck_tile::index_t WarpPerBlock_N = {tile_config["warp_n"]}; + static constexpr ck_tile::index_t WarpTile_M = {tile_config["warp_tile_m"]}; + static constexpr ck_tile::index_t WarpTile_N = {tile_config["warp_tile_n"]}; + static constexpr ck_tile::index_t ThreadTile_M = {tile_config["thread_tile_m"]}; + static constexpr ck_tile::index_t ThreadTile_N = {tile_config["thread_tile_n"]}; + + // Traits + static constexpr bool kOutputIndex = {output_index_str}; + static constexpr bool kPropagateNan = {propagate_nan_str}; + + // Pool shape + using BlockWarps 
= ck_tile::sequence; + using BlockTile = ck_tile::sequence; + using WarpTile = ck_tile::sequence; + using ThreadTile = ck_tile::sequence; + + using PoolShapeType = ck_tile::PoolShape; + + // Problem and kernel types + using Problem = ck_tile::PoolProblem; + using Kernel = ck_tile::PoolKernel; + + static float launch(ck_tile::PoolHostArgs& args, + const ck_tile::stream_config& stream) {{ + + constexpr ck_tile::index_t kBlockPerCu = 1; + const ck_tile::index_t kBlockSize = Kernel::BlockSize(); + + auto kernel_args = Kernel::MakeKernelArgs(args); + + if (!Kernel::IsSupportedArgument(kernel_args)) {{ + throw std::runtime_error( + std::string("Unsupported arguments for pooling kernel: ") + KERNEL_NAME); + }} + + const ck_tile::index_t kGridSize = Kernel::CalculateGridSize(kernel_args); + + if(stream.log_level_ > 0) {{ + std::cout << "Launching pooling kernel: " << KERNEL_NAME << "\\n" + << " grid_size: " << kGridSize << ", block_size: " << kBlockSize + << std::endl; + }} + + return ck_tile::launch_kernel( + stream, + ck_tile::make_kernel(Kernel{{}}, kGridSize, kBlockSize, 0, kernel_args)); + }} +}}; +""" + return kernel_name, instance_code + + def write_kernel_list(self): + """Write kernel list to file for CMake to read""" + tile_configs = self._get_tile_configs(fast_mode=False) + trait_combos = self._generate_trait_combinations() + + kernel_list = [] + for tile_config in tile_configs: + for trait_combo in trait_combos: + reduce_op, output_index, propagate_nan, pooling_dim = trait_combo + + kernel_name = ( + f"pool_{self.datatype}_{pooling_dim}_{reduce_op}_" + f"{'idx' if output_index else 'noidx'}_" + f"{'nan' if propagate_nan else 'nonan'}" + ) + + tile_str = ( + f"{tile_config['block_m']}x{tile_config['block_n']}_" + f"{tile_config['warp_m']}x{tile_config['warp_n']}_" + f"{tile_config['warp_tile_m']}x{tile_config['warp_tile_n']}_" + f"{tile_config['thread_tile_m']}x{tile_config['thread_tile_n']}" + ) + + kernel_name += f"_{tile_str}" + + trait_str = ( + 
f"{reduce_op}_" + f"{'true' if output_index else 'false'}_" + f"{'true' if propagate_nan else 'false'}_" + f"{pooling_dim}" + ) + + kernel_list.append( + { + "name": kernel_name, + "tile_config": tile_config, + "trait_combo": trait_combo, + "tile_str": tile_str, + "trait_str": trait_str, + } + ) + + # Write kernel count + with open(self.working_path / "pool_kernel_count.txt", "w") as f: + f.write(str(len(kernel_list))) + + # Write kernel list + with open(self.working_path / "pool_kernel_list.txt", "w") as f: + for kernel in kernel_list: + f.write( + f"{kernel['name']}|{kernel['tile_str']}|{kernel['trait_str']}\n" + ) + + print(f"Listed {len(kernel_list)} kernel configurations") + + def generate_individual(self, num_workers=None): + """Generate individual kernel files with parallel processing""" + if num_workers is None: + num_workers = min(multiprocessing.cpu_count(), 8) + + tile_configs = self._get_tile_configs() + trait_combos = self._generate_trait_combinations() + + work_items = [] + for tile_config in tile_configs: + for trait_combo in trait_combos: + work_items.append( + ( + tile_config, + trait_combo, + self.working_path, + self.datatype, + ) + ) + + print( + f"Generating {len(work_items)} individual kernel files using {num_workers} workers..." 
+ ) + + kernel_list = [] + completed = 0 + + with concurrent.futures.ProcessPoolExecutor( + max_workers=num_workers + ) as executor: + future_to_item = { + executor.submit(_generate_single_kernel_individual, item): item + for item in work_items + } + + for future in concurrent.futures.as_completed(future_to_item): + completed += 1 + if completed % 10 == 0 or completed == len(work_items): + print( + f" Progress: {completed}/{len(work_items)} kernels generated" + ) + + try: + result = future.result() + if result: + kernel_list.append(result) + except Exception as exc: + item = future_to_item[future] + print(f"Kernel generation failed for {item}: {exc}") + + kernel_list.sort(key=lambda x: x[0]) + print( + f"Generated {len(kernel_list)} individual kernel files in {self.working_path}" + ) + + def run(self, num_workers=None): + """Run the builder to generate individual kernel files""" + self.generate_individual(num_workers) + + +def _generate_single_kernel_individual(work_item): + """Worker function to generate a single individual kernel file""" + tile_config, trait_combo, working_path, datatype = work_item + + builder = PoolingKernelBuilder(working_path, datatype) + + try: + kernel_name, instance_code = builder._generate_kernel_instance( + tile_config, trait_combo + ) + + header_file = working_path / f"pooling_single_{kernel_name}.hpp" + with open(header_file, "w") as f: + f.write(instance_code) + + return (kernel_name, trait_combo, tile_config) + except Exception as e: + print(f"Error generating individual kernel: {e}") + return None + + +def main(): + logging.basicConfig(level=logging.INFO) + + parser = argparse.ArgumentParser( + description="Pooling kernel instance builder for tile_engine" + ) + parser.add_argument("--working_path", required=True, help="Working directory path") + parser.add_argument( + "--datatype", + required=True, + choices=["fp8", "fp16", "bf16", "fp32"], + help="Data type", + ) + parser.add_argument("--config_json", help="Configuration JSON 
file") + parser.add_argument( + "--num_workers", type=int, help="Number of parallel workers (default: auto)" + ) + parser.add_argument( + "--gen_individual", action="store_true", help="Generate individual kernel files" + ) + parser.add_argument( + "--gen_single", action="store_true", help="Generate a single kernel file" + ) + parser.add_argument("--kernel_name", help="Kernel name for single generation") + parser.add_argument( + "--tile_config", help="Tile configuration string for single generation" + ) + parser.add_argument( + "--trait_combo", help="Trait combination string for single generation" + ) + parser.add_argument( + "--list_kernels", + action="store_true", + help="List kernel configurations without generating files", + ) + + args = parser.parse_args() + + builder = PoolingKernelBuilder(args.working_path, args.datatype, args.config_json) + + if args.list_kernels: + builder.write_kernel_list() + elif args.gen_single: + if not args.kernel_name or not args.tile_config or not args.trait_combo: + parser.error( + "--gen_single requires --kernel_name, --tile_config, and --trait_combo" + ) + + # Parse tile config: "block_mx block_n_warp_mxwarp_n_warp_tile_mxwarp_tile_n_thread_tile_mxthread_tile_n" + tile_parts = args.tile_config.split("_") + block_dims = tile_parts[0].split("x") + warp_dims = tile_parts[1].split("x") + warp_tile_dims = tile_parts[2].split("x") + thread_tile_dims = tile_parts[3].split("x") + + tile_config = { + "block_m": int(block_dims[0]), + "block_n": int(block_dims[1]), + "warp_m": int(warp_dims[0]), + "warp_n": int(warp_dims[1]), + "warp_tile_m": int(warp_tile_dims[0]), + "warp_tile_n": int(warp_tile_dims[1]), + "thread_tile_m": int(thread_tile_dims[0]), + "thread_tile_n": int(thread_tile_dims[1]), + } + + # Parse trait combo: "reduce_op_output_index_propagate_nan_pooling_dim" + trait_parts = args.trait_combo.split("_") + trait_combo = ( + trait_parts[0], # reduce_op + trait_parts[1].lower() == "true", # output_index + trait_parts[2].lower() == 
"true", # propagate_nan + trait_parts[3], # pooling_dim + ) + + kernel_name, instance_code = builder._generate_kernel_instance( + tile_config, trait_combo + ) + + header_file = builder.working_path / f"pooling_single_{kernel_name}.hpp" + with open(header_file, "w") as f: + f.write(instance_code) + + print(f"Generated {header_file}") + + elif args.gen_individual: + builder.run(args.num_workers) + else: + parser.error( + "Must specify one of: --list_kernels, --gen_individual, or --gen_single" + ) + + +if __name__ == "__main__": + main() diff --git a/tile_engine/ops/pooling/pooling_validation_utils.py b/tile_engine/ops/pooling/pooling_validation_utils.py new file mode 100644 index 0000000000..27859064e4 --- /dev/null +++ b/tile_engine/ops/pooling/pooling_validation_utils.py @@ -0,0 +1,487 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Validation utilities for pooling tile_engine configurations. + +Validates tile configurations, trait combinations, and datatype support for +pooling kernels. Modelled after gemm_validation_utils.py — each constraint +from the CK PoolShape / PoolKernel static_asserts is mirrored here so that +invalid configs are rejected at code-generation time rather than at compile +or runtime. +""" + +import logging +from typing import List, Tuple + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Hardware constants +# --------------------------------------------------------------------------- + +# Default warp size (wave64 for CDNA architectures) +WARP_SIZE = 64 +MAX_BLOCK_SIZE = 1024 # Maximum threads per workgroup on AMD GPUs +MAX_LDS_BYTES = 65536 # 64 KB LDS per workgroup + +def get_warp_size_for_gpu(gpu_target: str) -> int: + """Get the warp size for a given GPU target. + + CDNA architectures (gfx9xx) use WAVE64 (64 threads per wavefront). 
# ---------------------------------------------------------------------------
# Warp-size helper
# ---------------------------------------------------------------------------


def get_warp_size_for_gpu(gpu_target: str) -> int:
    """Return the wavefront width for *gpu_target*.

    CDNA parts (``gfx9*``) execute WAVE64; RDNA parts (gfx10xx/11xx/12xx)
    and anything unrecognised fall back to WAVE32.
    """
    # NOTE(review): prefix match assumes every gfx9xx target is CDNA/WAVE64
    # — confirm should a WAVE32 gfx9 target ever appear.
    return 64 if gpu_target.startswith("gfx9") else 32


# ---------------------------------------------------------------------------
# Datatype helpers
# ---------------------------------------------------------------------------

# Bytes per element for every datatype the generator understands.
# int4 is sub-byte, hence the fractional width.
ELEMENT_SIZE_MAP = {
    "fp8": 1,
    "bf8": 1,
    "int8": 1,
    "fp16": 2,
    "bf16": 2,
    "int4": 0.5,
    "int32": 4,
    "fp32": 4,
    "fp64": 8,
}

# C++ spellings for datatypes that may appear in emitted kernel code.
# NOTE(review): the integer types listed in ELEMENT_SIZE_MAP have no entry
# here, so they are sized but not emittable — confirm this is intentional.
DTYPE_STRING_MAP = {
    "fp8": "ck_tile::fp8_t",
    "bf8": "ck_tile::bf8_t",
    "fp16": "ck_tile::fp16_t",
    "bf16": "ck_tile::bf16_t",
    "fp32": "float",
    "fp64": "double",
}

SUPPORTED_DATATYPES = list(DTYPE_STRING_MAP.keys())

# ---------------------------------------------------------------------------
# Reduce-op helpers
# ---------------------------------------------------------------------------

REDUCE_OP_STRING_MAP = {
    "max": "ck_tile::ReduceOp::Max",
    "min": "ck_tile::ReduceOp::Min",
    "avg": "ck_tile::ReduceOp::Add",
}

SUPPORTED_REDUCE_OPS = list(REDUCE_OP_STRING_MAP.keys())

SUPPORTED_POOLING_DIMS = ("2d", "3d")

# ---------------------------------------------------------------------------
# Public helper functions (used by the instance builder)
# ---------------------------------------------------------------------------


def element_size(datatype: str) -> float:
    """Byte-width of one element of *datatype* (case-insensitive).

    Raises:
        ValueError: if *datatype* is not listed in ELEMENT_SIZE_MAP.
    """
    key = datatype.lower()
    if key not in ELEMENT_SIZE_MAP:
        raise ValueError(
            f"Unsupported data type: '{key}'. "
            f"Supported: {list(ELEMENT_SIZE_MAP.keys())}"
        )
    return ELEMENT_SIZE_MAP[key]


def get_dtype_string(datatype: str) -> str:
    """C++ type spelling for *datatype*; unknown names fall back to ``"float"``.

    NOTE(review): unlike element_size() this neither lowercases its input nor
    raises on unknown types — confirm the silent fallback is intended.
    """
    return DTYPE_STRING_MAP.get(datatype, "float")


def get_reduce_op_string(reduce_op: str) -> str:
    """C++ ``ck_tile::ReduceOp`` enumerator for *reduce_op* (fallback: Max)."""
    return REDUCE_OP_STRING_MAP.get(reduce_op, "ck_tile::ReduceOp::Max")


# ---------------------------------------------------------------------------
# Individual tile-config validators
# ---------------------------------------------------------------------------


def validate_positivity(
    block_m: int,
    block_n: int,
    warp_m: int,
    warp_n: int,
    warp_tile_m: int,
    warp_tile_n: int,
    thread_tile_m: int,
    thread_tile_n: int,
) -> Tuple[bool, str]:
    """Every tile parameter must be a positive integer.

    Returns (ok, message); *message* names the first offending parameter.
    """
    checks = (
        ("block_m", block_m),
        ("block_n", block_n),
        ("warp_m", warp_m),
        ("warp_n", warp_n),
        ("warp_tile_m", warp_tile_m),
        ("warp_tile_n", warp_tile_n),
        ("thread_tile_m", thread_tile_m),
        ("thread_tile_n", thread_tile_n),
    )
    for name, val in checks:
        if val <= 0:
            return False, f"{name} ({val}) must be > 0"
    return True, ""


def validate_power_of_two(
    block_m: int,
    block_n: int,
    warp_m: int,
    warp_n: int,
    warp_tile_m: int,
    warp_tile_n: int,
    thread_tile_m: int,
    thread_tile_n: int,
) -> Tuple[bool, str]:
    """Every tile parameter should be a power of two for GPU addressing.

    Non-positive values are deliberately ignored here; they are caught by
    validate_positivity.
    """
    checks = (
        ("block_m", block_m),
        ("block_n", block_n),
        ("warp_m", warp_m),
        ("warp_n", warp_n),
        ("warp_tile_m", warp_tile_m),
        ("warp_tile_n", warp_tile_n),
        ("thread_tile_m", thread_tile_m),
        ("thread_tile_n", thread_tile_n),
    )
    for name, val in checks:
        # x & (x - 1) clears the lowest set bit: zero iff x is a power of two.
        if val > 0 and (val & (val - 1)) != 0:
            return False, f"{name} ({val}) is not a power of two"
    return True, ""


def validate_thread_tile_alignment(
    warp_tile_m: int,
    warp_tile_n: int,
    thread_tile_m: int,
    thread_tile_n: int,
) -> Tuple[bool, str]:
    """Warp tiles must split evenly into thread tiles.

    Mirrors pool_shape.hpp:
        static_assert(Warp_M % ThreadTile_M == 0);
        static_assert(Warp_N % ThreadTile_N == 0);
    """
    for axis, warp_tile, thread_tile in (
        ("m", warp_tile_m, thread_tile_m),
        ("n", warp_tile_n, thread_tile_n),
    ):
        if warp_tile % thread_tile != 0:
            return (
                False,
                f"warp_tile_{axis} ({warp_tile}) must be divisible by "
                f"thread_tile_{axis} ({thread_tile})",
            )
    return True, ""
def validate_warp_thread_distribution(
    warp_tile_m: int,
    warp_tile_n: int,
    thread_tile_m: int,
    thread_tile_n: int,
    warp_size: int = WARP_SIZE,
) -> Tuple[bool, str]:
    """A warp tile must decompose into a whole number of full wavefronts.

    Mirrors pool_shape.hpp:
        static_assert((Warp_M * Warp_N / ThreadTile_M / ThreadTile_N)
                      % get_warp_size() == 0);

    Fix over the literal static_assert: when the thread tile is larger than
    the warp tile the floor division yields 0 threads per warp, and
    ``0 % warp_size == 0`` would wrongly report the config as valid.  Zero
    threads per warp is never launchable, so it is rejected explicitly.
    """
    # NOTE(review): floor division silently discards any remainder; exact
    # divisibility is enforced separately by validate_thread_tile_alignment.
    threads_per_warp = (warp_tile_m * warp_tile_n) // (thread_tile_m * thread_tile_n)
    if threads_per_warp == 0:
        return (
            False,
            f"thread tile ({thread_tile_m}x{thread_tile_n}) is larger than "
            f"warp tile ({warp_tile_m}x{warp_tile_n}); 0 threads per warp",
        )
    if threads_per_warp % warp_size != 0:
        return (
            False,
            f"(warp_tile_m * warp_tile_n) / (thread_tile_m * thread_tile_n) = "
            f"{threads_per_warp} is not a multiple of warp_size ({warp_size})",
        )
    return True, ""


def _compute_warp_size_scale_factors(
    warp_tile_m: int,
    warp_tile_n: int,
    thread_tile_m: int,
    thread_tile_n: int,
    warp_size: int = WARP_SIZE,
) -> Tuple[int, int]:
    """
    Reproduce the WarpSizeScaleFactor_M / _N logic from pool_shape.hpp.

    The surplus wavefronts of an oversized warp tile are attributed to the
    dimension with more thread tiles; the other dimension keeps a factor of 1.

    NOTE(review): callers are expected to have passed
    validate_warp_thread_distribution first — otherwise the scale can floor
    to 0 and downstream coverage checks become vacuous.
    """
    threads_per_warp = (warp_tile_m * warp_tile_n) // (thread_tile_m * thread_tile_n)
    scale = threads_per_warp // warp_size

    if warp_tile_m // thread_tile_m > warp_tile_n // thread_tile_n:
        return scale, 1
    return 1, scale


def validate_block_tile_coverage(
    block_m: int,
    block_n: int,
    warp_m: int,
    warp_n: int,
    warp_tile_m: int,
    warp_tile_n: int,
    thread_tile_m: int,
    thread_tile_n: int,
    warp_size: int = WARP_SIZE,
) -> Tuple[bool, str]:
    """
    The scaled block tile must be covered by whole warp tiles.

    Mirrors pool_shape.hpp:
        static_assert((Block_M * WarpSizeScaleFactor_M) %
                      (WarpPerBlock_M * Warp_M) == 0);
        static_assert((Block_N * WarpSizeScaleFactor_N) %
                      (WarpPerBlock_N * Warp_N) == 0);
    """
    sf_m, sf_n = _compute_warp_size_scale_factors(
        warp_tile_m, warp_tile_n, thread_tile_m, thread_tile_n, warp_size
    )

    if (block_m * sf_m) % (warp_m * warp_tile_m) != 0:
        return (
            False,
            f"block_m*ScaleFactor_M ({block_m}*{sf_m}={block_m * sf_m}) must be "
            f"divisible by warp_m*warp_tile_m ({warp_m}*{warp_tile_m}"
            f"={warp_m * warp_tile_m})",
        )
    if (block_n * sf_n) % (warp_n * warp_tile_n) != 0:
        return (
            False,
            f"block_n*ScaleFactor_N ({block_n}*{sf_n}={block_n * sf_n}) must be "
            f"divisible by warp_n*warp_tile_n ({warp_n}*{warp_tile_n}"
            f"={warp_n * warp_tile_n})",
        )
    return True, ""


def validate_block_size(
    warp_m: int,
    warp_n: int,
    warp_size: int = WARP_SIZE,
) -> Tuple[bool, str]:
    """BlockSize = warp_size * warp_m * warp_n must be <= MAX_BLOCK_SIZE.

    MAX_BLOCK_SIZE reflects the 1024-thread workgroup limit on AMD GPUs.
    """
    block_size = warp_size * warp_m * warp_n
    if block_size > MAX_BLOCK_SIZE:
        return (
            False,
            f"BlockSize ({block_size} = {warp_size}*{warp_m}*{warp_n}) "
            f"exceeds maximum ({MAX_BLOCK_SIZE})",
        )
    return True, ""
def validate_vector_load_alignment(
    block_m: int,
    thread_tile_m: int,
    in_datatype: str,
) -> Tuple[bool, str]:
    """
    The M-dimension thread-tile determines the contiguous vector load width.
    It must produce a load whose byte-width divides 16 bytes (the maximum
    global vector load width on AMD GPUs) and is at least 1 element wide.

    ``block_m`` is currently unused; kept for signature parity with the
    other validators.
    """
    elem_bytes = element_size(in_datatype)
    load_bytes = thread_tile_m * elem_bytes
    if load_bytes > 16:
        return (
            False,
            f"thread_tile_m ({thread_tile_m}) * element_size({in_datatype}, "
            f"{elem_bytes}B) = {load_bytes}B exceeds 16B max vector load",
        )
    # Fix: the original also tested `load_bytes % 16 != 0`, which is dead
    # code once load_bytes <= 16 holds — it is only False at exactly 16B,
    # where `16 % load_bytes == 0` already short-circuits the condition.
    if 16 % load_bytes != 0:
        return (
            False,
            f"Vector load width ({load_bytes}B) is not a divisor of 16B",
        )
    return True, ""


def validate_repeat_factors(
    block_m: int,
    block_n: int,
    warp_m: int,
    warp_n: int,
    warp_tile_m: int,
    warp_tile_n: int,
    thread_tile_m: int,
    thread_tile_n: int,
    warp_size: int = WARP_SIZE,
) -> Tuple[bool, str]:
    """
    Repeat_M and Repeat_N from pool_shape.hpp must be >= 1.  They are the
    number of tile iterations each warp performs within the block.

    ``warp_size`` (new, default WARP_SIZE for backward compatibility) lets
    callers compute the scale factors with the GPU-specific wavefront width
    instead of always assuming WAVE64 — the original hard-coded the default,
    diverging from the other checks on WAVE32 (RDNA) targets.
    """
    sf_m, sf_n = _compute_warp_size_scale_factors(
        warp_tile_m, warp_tile_n, thread_tile_m, thread_tile_n, warp_size
    )
    repeat_m = (block_m * sf_m) // (warp_m * warp_tile_m)
    repeat_n = (block_n * sf_n) // (warp_n * warp_tile_n)
    if repeat_m < 1:
        return False, f"Repeat_M ({repeat_m}) must be >= 1"
    if repeat_n < 1:
        return False, f"Repeat_N ({repeat_n}) must be >= 1"
    return True, ""


# ---------------------------------------------------------------------------
# Comprehensive tile-config validation (entry point)
# ---------------------------------------------------------------------------


def is_tile_config_valid(
    block_m: int,
    block_n: int,
    warp_m: int,
    warp_n: int,
    warp_tile_m: int,
    warp_tile_n: int,
    thread_tile_m: int,
    thread_tile_n: int,
    in_datatype: str,
    out_datatype: str,
    fast_mode: bool = False,
    gpu_target: str = "gfx90a",
) -> bool:
    """
    Comprehensive pooling tile configuration validation.

    When *fast_mode* is True only cheap sanity checks are performed (useful
    for the ``--list_kernels`` path).  Full mode mirrors every
    ``static_assert`` in ``pool_shape.hpp``.

    Parameters
    ----------
    block_m, block_n : Block tile dimensions (M = output elems, N = window).
    warp_m, warp_n : Warps per block along each dimension.
    warp_tile_m, warp_tile_n : Tile processed per warp.
    thread_tile_m, thread_tile_n : Contiguous elements per thread.
    in_datatype : Input element type (e.g. ``"fp16"``).
    out_datatype : Output element type.
        NOTE(review): currently unused by any check — confirm whether the
        output type should also constrain vector-store width.
    fast_mode : Skip expensive checks when True.
    gpu_target : Target GPU (selects WAVE32 vs WAVE64 for full-mode checks).
    """
    all_params = (
        block_m, block_n, warp_m, warp_n,
        warp_tile_m, warp_tile_n, thread_tile_m, thread_tile_n,
    )

    # --- Positivity (always) ---
    ok, err = validate_positivity(*all_params)
    if not ok:
        logger.debug(f"Positivity check failed: {err}")
        return False

    # --- Thread-tile alignment (always) ---
    ok, err = validate_thread_tile_alignment(
        warp_tile_m, warp_tile_n, thread_tile_m, thread_tile_n
    )
    if not ok:
        logger.debug(f"Thread tile alignment failed: {err}")
        return False

    if fast_mode:
        return True

    # Get the warp size for this GPU target (WAVE64 on CDNA, WAVE32 on RDNA).
    warp_size = get_warp_size_for_gpu(gpu_target)

    # --- Power-of-two ---
    ok, err = validate_power_of_two(*all_params)
    if not ok:
        logger.debug(f"Power-of-two check failed: {err}")
        return False

    # --- Warp-thread distribution ---
    ok, err = validate_warp_thread_distribution(
        warp_tile_m, warp_tile_n, thread_tile_m, thread_tile_n, warp_size
    )
    if not ok:
        logger.debug(f"Warp thread distribution failed: {err}")
        return False

    # --- Block-tile coverage ---
    ok, err = validate_block_tile_coverage(*all_params, warp_size=warp_size)
    if not ok:
        logger.debug(f"Block tile coverage failed: {err}")
        return False

    # --- Block size ---
    ok, err = validate_block_size(warp_m, warp_n, warp_size)
    if not ok:
        logger.debug(f"Block size check failed: {err}")
        return False

    # --- Repeat factors ---
    # Fix: pass the GPU-specific warp size so the scale factors agree with
    # the coverage/distribution checks above (previously defaulted to WAVE64).
    ok, err = validate_repeat_factors(*all_params, warp_size=warp_size)
    if not ok:
        logger.debug(f"Repeat factor check failed: {err}")
        return False

    # --- Vector load alignment ---
    ok, err = validate_vector_load_alignment(block_m, thread_tile_m, in_datatype)
    if not ok:
        logger.debug(f"Vector load alignment failed: {err}")
        return False

    return True
# ---------------------------------------------------------------------------
# Trait-combination validation
# ---------------------------------------------------------------------------


def is_trait_combination_valid(
    reduce_op: str,
    output_index: bool,
    propagate_nan: bool,
    pooling_dim: str,
) -> bool:
    """
    Validate a pooling trait combination.

    Parameters
    ----------
    reduce_op : ``"max"``, ``"min"``, or ``"avg"``.
    output_index : Whether to output indices of the selected elements.
    propagate_nan: Whether to propagate NaN values through the reduction.
    pooling_dim : ``"2d"`` or ``"3d"``.

    Returns True when the combination is generatable; otherwise logs the
    first violated constraint at debug level and returns False.
    """
    reason = None
    if reduce_op not in SUPPORTED_REDUCE_OPS:
        reason = f"Unsupported reduce_op: '{reduce_op}'"
    elif pooling_dim not in SUPPORTED_POOLING_DIMS:
        reason = f"Invalid pooling dimension: '{pooling_dim}'"
    elif output_index and reduce_op != "max":
        # output_index only makes sense for max pooling (CK constraint).
        reason = (
            f"output_index=True is only supported for 'max' pooling, "
            f"not '{reduce_op}'"
        )

    if reason is not None:
        logger.debug(reason)
        return False
    return True


# ---------------------------------------------------------------------------
# Datatype validation
# ---------------------------------------------------------------------------


def is_datatype_supported(datatype: str) -> bool:
    """Return True if *datatype* (case-insensitive) has a known element size.

    NOTE(review): membership is checked against ELEMENT_SIZE_MAP, which is a
    strict superset of DTYPE_STRING_MAP / SUPPORTED_DATATYPES — confirm the
    integer types really are generatable end-to-end.
    """
    return datatype.lower() in ELEMENT_SIZE_MAP
"multiops_multiblock;multiops_threadwise" CACHE STRING function(build_multi_reduce_for_datatype datatype variant) # Filter GPU targets to only gfx942, and gfx950 set(GPU_TARGETS "") - set(DESIRED_TARGETS "gfx942;gfx950") + set(DESIRED_TARGETS "gfx942;gfx950;gfx12-generic") set(VALID_VARIANTS "multiops_multiblock;multiops_threadwise") foreach(target IN LISTS SUPPORTED_GPU_TARGETS) @@ -22,11 +22,11 @@ function(build_multi_reduce_for_datatype datatype variant) # Skip compilation if no matching targets found if(NOT GPU_TARGETS) - message(WARNING "Skipping Tile Engine for Multi Reduction Kernel: No supported GPU targets (gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") + message(WARNING "Skipping Tile Engine for Multi Reduction Kernel: No supported GPU targets (gfx942, gfx950, gfx12-generic) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") return() endif() - message(STATUS "Building Reduction for GPU targets: ${GPU_TARGETS}") + message(VERBOSE "Building Reduction for GPU targets: ${GPU_TARGETS}") set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${variant}") file(MAKE_DIRECTORY "${working_path}") @@ -75,7 +75,7 @@ function(build_multi_reduce_for_datatype datatype variant) message(FATAL_ERROR "Failed to generate kernels for ${datatype} ${variant}: ${ret}") endif() - message(STATUS "Generated ${datatype} ${variant} reduction kernel blobs at: ${working_path}") + message(VERBOSE "Generated ${datatype} ${variant} reduction kernel blobs at: ${working_path}") # # Add test executables for each generated test file(STRINGS "${working_path}/reduce_${variant}_blobs_list.txt" test_basenames) @@ -85,7 +85,7 @@ function(build_multi_reduce_for_datatype datatype variant) set(test_src "${working_path}/${test_base}.cpp") set(test_target "${test_base}") - add_executable(${test_target} ${test_src}) + add_executable(${test_target} EXCLUDE_FROM_ALL ${test_src}) target_include_directories(${test_target} PRIVATE 
"${CMAKE_SOURCE_DIR}/test/ck_tile/reduce/" ${working_path}