Merge branch 'develop' of https://github.com/ROCm/composable_kernel into wip-async-tr-fa

2026-06-08 23:38:11 +00:00 · 2025-08-08 07:50:12 +00:00
parent 1ecee378d5 ab26026835
commit 6bb57c2c57
794 changed files with 42248 additions and 11458 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ repos:
    hooks:
    -   id: clang-format
        name: clang-format
-        entry: clang-format-12 -i --style=file
+        entry: clang-format-18 -i --style=file
        language: system
        types_or: [c++, inc]
    -   id: copyright-year-checker
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,7 @@

 Documentation for Composable Kernel available at [https://rocm.docs.amd.com/projects/composable_kernel/en/latest/](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/).

-## Composable Kernel 1.1.0 for ROCm 6.5.0
+## Composable Kernel 1.1.0 for ROCm 7.0.0

 ### Added

@@ -19,10 +19,12 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 * Added support for Split K for grouped convolution backward data.
 * Added logit soft-capping support for fMHA forward kernels.
 * Added support for hdim as a multiple of 32 for FMHA (fwd/fwd_splitkv)
+* Added support for hdim as a multiple of 32 for FMHA (fwd/fwd_splitkv/bwd)
 * Added benchmarking support for tile engine GEMM.
 * Added Ping-pong scheduler support for GEMM operation along the K dimension.
 * Added rotating buffer feature for CK_Tile GEMM.
 * Added int8 support for CK_TILE GEMM.
+* Added support for elementwise kernel.

 ### Optimized

@@ -49,6 +51,10 @@ None

 None

+### Upcoming changes
+
+* Non-grouped convolutions are deprecated. All of their functionality is supported by grouped convolution.
+
 ## Composable Kernel 1.1.0 for ROCm 6.1.0

 ### Additions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -98,6 +98,12 @@ add_compile_options(-Wno-pass-failed)
 add_compile_options(-Wno-switch-default)
 add_compile_options(-Wno-unique-object-duplication)

+# add -Og -gdwarf64 for debug builds
+add_compile_options(
+    "$<$<CONFIG:Debug>:-Og>"
+    "$<$<CONFIG:Debug>:-gdwarf64>"
+)
+
 # Recent change in compiler makes this warning ON by default, which led to compile errors.
 add_compile_options(-Wno-nrvo)

@@ -236,6 +242,8 @@ endif()
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx950")
    add_definitions(-DCK_USE_NATIVE_MX_SUPPORT)
    set(CK_USE_NATIVE_MX_SUPPORT "ON")
+    add_definitions(-DCK_GFX950_SUPPORT)
+    set(CK_GFX950_SUPPORT "ON")
 endif()

 option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
@@ -334,7 +342,7 @@ find_package(Threads REQUIRED)
 link_libraries(Threads::Threads)

 ## C++
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}")
--- a/1
+++ b/1
@@ -62,6 +62,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    libzstd-dev \
    openssh-server \
    clang-format-12 \
+    clang-format-18 \
    kmod && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
--- a/Dockerfile.aiter
+++ b/Dockerfile.aiter
@@ -0,0 +1,17 @@
+ARG BASE_DOCKER="rocm/pytorch:latest"
+FROM $BASE_DOCKER
+RUN groupadd -f render && \
+    pip install pandas zmq einops && \
+    pip install numpy==1.26.2 && \
+    sudo mkdir /home/jenkins && \
+    sudo mkdir /home/jenkins/workspace && \
+    cd /home/jenkins/workspace && \
+    rm -rf aiter && \
+    git clone --recursive https://github.com/ROCm/aiter.git && \
+    cd aiter && \
+    rm -rf 3rdparty/composable_kernel/ && \
+    git clone https://github.com/ROCm/composable_kernel.git 3rdparty/composable_kernel/ && \
+    python3 setup.py develop && \
+    chown -R jenkins:jenkins /home/jenkins/workspace && \
+    chmod -R a+rwx /home/jenkins/workspace && \
+    sudo usermod -aG irc jenkins
--- a/95
+++ b/95
@@ -188,12 +188,16 @@ def buildDocker(install_prefix){
    if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
        dockerArgs = dockerArgs + " --no-cache --build-arg BASE_DOCKER='${base_image_name}' -f Dockerfile.compiler . "
    }
+    else if(params.RUN_AITER_TESTS){
+        image_name = "rocm/composable_kernel:ck_aiter"
+        dockerArgs = dockerArgs + " --no-cache -f Dockerfile.aiter . "
+    }
    else{
        dockerArgs = dockerArgs + " -f Dockerfile . "
    }
    echo "Build Args: ${dockerArgs}"
    try{
-        if(params.BUILD_DOCKER){
+        if(params.BUILD_DOCKER || params.RUN_AITER_TESTS){
            //force building the new docker if that parameter is true
            echo "Building image: ${image_name}"
            retimage = docker.build("${image_name}", dockerArgs)
@@ -595,7 +599,7 @@ def Build_CK(Map conf=[:]){
                        if (params.RUN_FULL_QA && arch == 2 ){
                            // build deb packages
                            echo "Build packages"
-                            sh 'make -j package'
+                            sh 'ninja package'
                            archiveArtifacts artifacts: 'composablekernel*.deb'
                            sh 'mv composablekernel-ckprofiler_*.deb composablekernel-ckprofiler_1.1.0_amd64.deb'
                            sh 'mv composablekernel-dev_*.deb composablekernel-dev_1.1.0_amd64.deb'
@@ -807,13 +811,62 @@ def process_results(Map conf=[:]){
    }
 }

+def run_aiter_tests(Map conf=[:]){
+    show_node_info()
+    env.HSA_ENABLE_SDMA=0
+    checkout scm
+    //use the latest pytorch image
+    def image = "rocm/composable_kernel:ck_aiter"
+    def dockerOpts="--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --user=jenkins -v=/var/jenkins/:/var/jenkins"
+    def variant = env.STAGE_NAME
+    def retimage
+    def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
+    def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
+    dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
+    echo "Docker flags: ${dockerOpts}"
+
+    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+        try
+        {
+            echo "Pulling image: ${image}"
+            retimage = docker.image("${image}")
+            withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) {
+                retimage.pull()
+            }
+        }
+        catch(Exception ex)
+        {
+            error "Unable to locate image: ${image}"
+        }
+    }
+
+    withDockerContainer(image: image, args: dockerOpts) {
+        timeout(time: 45, unit: 'MINUTES'){
+            try{
+                sh "python3 --version"
+                sh "rocminfo"
+                sh "python3 ../aiter/op_tests/test_gemm_a8w8_blockscale.py"
+                //sh "python3 ../aiter/op_tests/test_mha.py"
+            }
+            catch(e){
+                echo "Throwing error exception while running AITER tests"
+                echo 'Exception occurred: ' + e.toString()
+                throw e
+            }
+            finally{
+                echo "Finished running AITER tests"
+            }
+        }
+    }
+}
+
 //launch develop branch daily jobs
 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_TRANSPOSE_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
-                                              0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
+                                              0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
                                              0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
                                              0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
                                              0 15 * * * % BUILD_INSTANCES_ONLY=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
-                                              0 13 * * * % BUILD_LEGACY_OS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false''' : ""
+                                              0 13 * * * % RUN_AITER_TESTS=true;BUILD_LEGACY_OS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false''' : ""

 pipeline {
    agent none
@@ -914,8 +967,8 @@ pipeline {
            description: "Build CK and run tests on gfx90a (default: ON)")
        booleanParam(
            name: "BUILD_GFX942",
-            defaultValue: true,
-            description: "Build CK and run tests on gfx942 (default: ON)")
+            defaultValue: false,
+            description: "Build CK and run tests on gfx942 (default: OFF)")
        booleanParam(
            name: "BUILD_GFX950",
            defaultValue: false,
@@ -952,6 +1005,10 @@ pipeline {
            name: "RUN_ALL_UNIT_TESTS",
            defaultValue: false,
            description: "Run all unit tests (default: OFF)")
+        booleanParam(
+            name: "RUN_AITER_TESTS",
+            defaultValue: false,
+            description: "Run AITER tests with latest CK develop branch (default: OFF)")
    }
    environment{
        dbuser = "${dbuser}"
@@ -994,7 +1051,7 @@ pipeline {
                                -o -iname \'*.cpp.in\' \
                                -o -iname \'*.cl\' \
                                | grep -v 'build/' \
-                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-12 -style=file {} | diff - {}\' && \
+                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\' && \
                                /cppcheck/build/bin/cppcheck ../* -v -j \$(nproc) -I ../include -I ../profiler/include -I ../library/include \
                                -D CK_ENABLE_FP64 -D CK_ENABLE_FP32 -D CK_ENABLE_FP16 -D CK_ENABLE_FP8 -D CK_ENABLE_BF16 -D CK_ENABLE_BF8 -D CK_ENABLE_INT8 \
                                -D __gfx908__ -D __gfx90a__ -D __gfx942__ -D __gfx1030__ -D __gfx1100__ -D __gfx1101__ -D __gfx1102__ \
@@ -1023,7 +1080,7 @@ pipeline {
                                -o -iname \'*.cpp.in\' \
                                -o -iname \'*.cl\' \
                                | grep -v 'build/' \
-                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-12 -style=file {} | diff - {}\'"
+                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\'"
                    }
                    steps{
                        buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
@@ -1032,6 +1089,24 @@ pipeline {
                }
            }
        }
+        stage("Run AITER Tests")
+        {
+            parallel
+            {
+                stage("Run AITER Tests on gfx90a")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_AITER_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx90a")}
+                    steps{
+                        run_aiter_tests()
+                        cleanWs()
+                    }
+                }
+            }
+        }
        stage("Run Grouped Conv Large Case Tests")
        {
            parallel
@@ -1046,8 +1121,8 @@ pipeline {
                    environment{
                        setup_args = "NO_CK_BUILD"
                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx90a && \
-                                           make -j64 test_grouped_convnd_fwd_large_cases_xdl test_grouped_convnd_bwd_data_xdl_large_cases && \
-                                           ./bin/test_grouped_convnd_fwd_large_cases_xdl && ./bin/test_grouped_convnd_bwd_data_xdl_large_cases"""
+                                           make -j64 test_grouped_convnd_fwd_large_cases_xdl test_grouped_convnd_bwd_data_xdl_large_cases test_grouped_convnd_fwd_bias_clamp_large_cases && \
+                                           ./bin/test_grouped_convnd_fwd_large_cases_xdl && ./bin/test_grouped_convnd_bwd_data_xdl_large_cases && ./bin/test_grouped_convnd_fwd_bias_clamp_large_cases"""
                    }
                    steps{
                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
--- a/TERMINOLOGY.md
+++ b/TERMINOLOGY.md
@@ -1,2 +1,348 @@
 [Back to the main page](./README.md)
-# Composable Kernel terminology
+
+# Composable Kernel Terminology
+
+This document provides a technical reference for terminology used in the Composable Kernel library, organized by conceptual progression from hardware to machine learning operations.
+
+---
+
+## Glossary Index (Alphabetical)
+
+- [Add+Multiply](#addmultiply)
+- [Bank Conflict](#bank-conflict)
+- [Batched GEMM](#batched-gemm)
+- [Benchmark](#benchmark)
+- [Block Size](#block-size)
+- [Block Tile](#block-tile)
+- [Compute Unit (CU)](#compute-unit-cu)
+- [Coordinate Transformation Primitives](#coordinate-transformation-primitives)
+- [CUDA](#cuda)
+- [Dense Tensor](#dense-tensor)
+- [Descriptor](#descriptor)
+- [Device](#device)
+- [Elementwise](#elementwise)
+- [Epilogue](#epilogue)
+- [Fast Changing Dimension](#fast-changing-dimension)
+- [GEMM](#gemm-general-matrix-multiply)
+- [GEMV](#gemv)
+- [Grouped GEMM](#grouped-gemm)
+- [Global Memory](#global-memory)
+- [Grid](#grid)
+- [Host](#host)
+- [HIP](#hip)
+- [Inner Dimension](#inner-dimension)
+- [Inner Product](#inner-product)
+- [Input/Problem Shape](#inputproblem-shape)
+- [Kernel](#kernel)
+- [Launch Parameters](#launch-parameters)
+- [Load Tile](#load-tile)
+- [LDS Banks](#lds-banks)
+- [Matrix Core](#matrix-core)
+- [MFMA (Matrix Fused Multiply-Add)](#mfma-matrix-fused-multiply-add)
+- [Occupancy](#occupancy)
+- [Outer Dimension](#outer-dimension)
+- [Outer Product](#outer-product)
+- [Pinned Memory](#pinned-memory)
+- [Pipeline](#pipeline)
+- [Policy](#policy)
+- [Problem](#problem)
+- [Processing Units](#processing-units)
+- [Reference Kernel](#reference-kernel)
+- [Regression Test](#regression-test)
+- [ROCm](#rocm)
+- [Scalar General Purpose Register (SGPR)](#scalar-general-purpose-register-sgpr)
+- [Shared Memory / LDS (Local Data Share)](#shared-memory--lds-local-data-share)
+- [SIMT / SIMD](#simt--simd)
+- [Smoke Test](#smoke-test)
+- [Sparse Tensor](#sparse-tensor)
+- [Split-K GEMM](#split-k-gemm)
+- [Store Tile](#store-tile)
+- [Thread / Work-item](#thread--work-item)
+- [Thread Block / Work Group](#thread-block--work-group)
+- [Vanilla GEMM](#vanilla-gemm)
+- [Tile](#tile)
+- [Tile Distribution](#tile-distribution)
+- [Tile Partitioner](#tile-partitioner)
+- [Tile Programming API](#tile-programming-api)
+- [Tile Window](#tile-window)
+- [User Customized Tile Pipeline](#user-customized-tile-pipeline)
+- [User Customized Tile Pipeline Optimization](#user-customized-tile-pipeline-optimization)
+- [Vector](#vector)
+- [Vector General Purpose Register (VGPR)](#vector-general-purpose-register-vgpr)
+- [Warp / Wavefront](#warp--wavefront)
+- [Wave Tile](#wave-tile)
+- [XDL Instructions](#xdl-instructions)
+
+---
+
+## 1. Hardware and Memory
+
+### Processing Units
+The GPU is composed of multiple hardware units ([compute units (CUs)](#compute-unit-cu) on AMD, [streaming multiprocessors (SMs)](#compute-unit-cu) on NVIDIA), each containing many cores that run threads in parallel. These units manage shared resources and coordinate execution at scale.
+
+### Matrix Core
+Specialized GPU units that accelerate matrix operations for AI and deep learning tasks. Modern GPUs contain multiple matrix cores.
+
+### Compute Unit (CU)
+AMD's parallel vector processor in a GPU with multiple ALUs. Each compute unit will run all the waves in a workgroup. _This is equivalent to NVIDIA's streaming multiprocessor (SM)_.
+
+### Matrix Fused Multiply-Add (MFMA)
+AMD's matrix core instruction for efficient GEMM operations. CK optimizes kernel designs to maximize MFMA utilization and performance.
+
+### Registers
+The fastest memory tier, registers are private to each thread/work-item and used for storing temporary variables during computation. AMD distinguishes between [vector (VGPR)](#vector-general-purpose-register-vgpr) and [scalar (SGPR)](#scalar-general-purpose-register-sgpr) registers, while NVIDIA uses a unified register file.
+
+### Vector General Purpose Register (VGPR)
+Per-thread registers that store individual thread data within a wave. Each thread has its own set of VGPRs for private variables and calculations.
+
+### Scalar General Purpose Register (SGPR)
+Wave-level registers shared by all threads in a wave. Used for constants, addresses, and control flow common across the entire wave.
+
+### Shared Memory / Local Data Share (LDS)
+AMD's high-bandwidth, low-latency on-chip memory accessible to all threads within a work group. This is equivalent to NVIDIA's shared memory. It enables fast data sharing and synchronization, but is limited in capacity and must be managed to avoid [bank conflicts](#bank-conflict).
+
+### LDS Banks
+Memory organization where consecutive addresses are distributed across multiple memory banks for parallel access. Prevents memory access conflicts ([bank conflicts](#bank-conflict)) and improves bandwidth.
+
+### Global Memory
+The main device memory accessible by all threads, offering high capacity but higher latency than shared memory.
+
+### Pinned Memory
+Host memory that is page-locked to accelerate transfers between CPU and GPU, reducing overhead for large data movements.
+
+### Dense Tensor
+A tensor in which most elements are nonzero, typically stored in a contiguous block of memory.
+
+### Sparse Tensor
+A tensor in which most elements are zero, allowing for memory and computation optimizations by storing only nonzero values and their indices.
+
+### Host
+CPU and main memory system that manages GPU execution. Launches kernels, transfers data, and coordinates overall computation.
+
+### Device
+GPU hardware that executes parallel kernels. Contains compute units, memory hierarchy, and specialized accelerators.
+
+---
+
+## 2. GPU Programming Model
+
+### Thread / Work-item
+AMD's work-item is the smallest unit of parallel execution, each running an independent instruction stream on a single data element. This is equivalent to NVIDIA's thread. Work-items/threads are grouped into [wavefronts (AMD)](#warp--wavefront) and [warps (NVIDIA)](#warp--wavefront) for efficient scheduling and resource sharing.
+
+### Warp / Wavefront
+AMD's wavefront is a group of threads that run instructions in lockstep, forming the SIMD group. This is equivalent to NVIDIA's warp.
+
+### Thread Block / Work Group
+AMD's work group is a collection of threads/work-items that can synchronize and share memory. This is equivalent to NVIDIA's thread block. Work groups/thread blocks are scheduled independently and mapped to hardware units for execution.
+
+### Grid
+The complete collection of all work groups (thread blocks) that execute a kernel. A grid spans the entire computational domain and is organized in 1D, 2D, or 3D dimensions. Each work group within the grid operates independently and can be scheduled on different compute units, enabling massive parallel execution across the entire GPU.
+
+### Block Size
+Number of work-items/threads in a compute unit (CU). Determines work group size and memory usage.
+
+### Single-Instruction, Multi-Thread (SIMT) / Single-Instruction, Multi-Data (SIMD)
+SIMT (Single-Instruction, Multi-Thread) allows threads in a warp to diverge, while SIMD (Single-Instruction, Multi-Data) enforces strict lockstep execution within wavefronts. These models define how parallelism is expressed and managed on different architectures.
+
+### Occupancy
+The ratio of active warps/wavefronts to the maximum number of warps/wavefronts supported by a hardware unit. Affects the ability to hide memory latency and maximize throughput.
+
+---
+
+## 3. Kernel Structure
+
+### Kernel
+A function executed on the GPU, typically written in [HIP](#hip) or [CUDA](#cuda), that performs parallel computations over input data. Kernels are launched with specific grid and block dimensions to map computation to hardware. In CK, kernels are composed from pipelines and require a pipeline, tile partitioner, and epilogue component.
+
+### Pipeline
+A CK Pipeline orchestrates the sequence of operations for a kernel, including data loading, computation, and storage phases. It consists of two core components: a [Problem](#problem) component that defines what to compute, and a [Policy](#policy) component that specifies how to move data around. 
+
+### Tile Partitioner
+Defines the mapping between problem dimensions (M, N, K) and GPU hierarchy. It specifies workgroup-level tile sizes (kM, kN, kK) and determines grid dimensions by dividing the problem size by tile sizes.
+
+### Problem
+Defines what to compute - input/output shapes, data types, and mathematical operations (e.g., GEMM, convolution).
+
+### Policy
+Defines memory access patterns and hardware-specific optimizations.
+
+### User Customized Tile Pipeline
+User-defined pipeline that combines custom problem and policy components for specialized computations. CK also provides prebuilt pipelines and policies for common operations that can be used as starting points.
+
+### User Customized Tile Pipeline Optimization
+Process of tuning tile sizes, memory access patterns, and hardware utilization for specific workloads. CK also provides prebuilt pipelines and policies for common operations that can be used as starting points.
+
+### Tile Programming API
+CK's high-level interface for defining tile-based computations with predefined hardware mapping for data load/store.
+
+### Coordinate Transformation Primitives
+CK utilities for converting between different coordinate systems (logical, physical, memory layouts).
+
+### Reference Kernel
+A baseline kernel implementation used to verify correctness and performance. CK has two reference kernel implementations: one for CPU and one for GPU.
+
+### Launch Parameters
+Configuration values (e.g., grid size, block size) that determine how a kernel is mapped to hardware resources. Proper tuning of these parameters is essential for optimal performance.
+
+---
+
+## 4. Memory Access and Data Layout
+
+### Memory Coalescing
+An optimization where consecutive threads access consecutive memory addresses, allowing a single memory transaction to serve multiple threads. Proper coalescing is vital for achieving peak memory bandwidth.
+
+### Alignment
+A memory management startegy for efficient memory access where data structures are stored at addresses that are multiples of a specific value.
+
+### Bank Conflict
+Occurs when multiple threads in a warp/wavefront access different addresses mapping to the same shared memory bank, causing serialization and reduced bandwidth.
+
+### Padding
+The addition of extra elements (often zeros) to tensor edges. This is used to control output size in convolution and pooling, or to align data for efficient memory access.
+
+### Permute/Transpose
+Operations that rearrange the order of tensor axes, often required to match kernel input formats or optimize memory access patterns.
+
+### Host-Device Transfer
+The process of moving data between CPU (host) and GPU (device) memory. Host-device transfers can be a performance bottleneck and are optimized using pinned memory and asynchronous operations.
+
+### Stride
+The step size to move from one element to the next in a particular dimension of a tensor or matrix. In convolution and pooling, stride determines how far the kernel moves at each step.
+
+### Dilation
+The spacing between kernel elements in convolution operations, allowing the receptive field to grow without increasing kernel size.
+
+### Im2Col/Col2Im
+Data transformation techniques that convert image data to column format (im2col) for efficient convolution and back (col2im) to reconstruct the original layout.
+
+### Fast Changing Dimension
+Innermost dimension that changes fastest in memory layout.
+
+### Outer Dimension
+Slower-changing dimension in memory layout.
+
+### Inner Dimension
+Faster-changing dimension in memory layout.
+
+---
+
+## 5. Tile-Based Computing and Data Structures
+
+### Tile
+A sub-region of a tensor or matrix processed by a block or thread. Tiles are used to improve memory locality and enable blocking strategies in kernels. Rectangular data blocks are the unit of computation and memory transfer in CK and the basis for tiled algorithms.
+
+### Block Tile
+Memory tile processed by a work group (thread block).
+
+### Wave Tile
+Sub-tile processed by a single wave within a work group. Represents the granularity of SIMD execution.
+
+### Tile Distribution
+Hierarchical data mapping from work-items to data in memory.
+
+### Tile Window
+Viewport into a larger tensor that defines the current tile's position and boundaries for computation.
+
+### Load Tile
+Operation that transfers data from global memory/LDS to per-thread registers using optimized memory access patterns.
+
+### Store Tile
+Operation that transfers data from per-thread registers to LDS/global memory using optimized memory access patterns.
+
+### Descriptor
+Metadata structure that defines tile properties, memory layouts, and coordinate transformations for CK operations.
+
+### Input/Problem Shape
+Dimensions and data types of input tensors that define the computational problem (e.g., M×K, K×N for GEMM).
+
+### Vector
+Smallest data unit processed by individual threads. Typically 4-16 elements depending on data type and hardware.
+
+---
+
+## 6. Kernel Operations and Optimization
+
+### Elementwise
+Operations applied independently to each tensor element, such as addition or multiplication. These are highly parallelizable and benefit from efficient memory access.
+
+### Epilogue
+The final stage of a kernel or operation, often applying activation functions, bias, or other post-processing steps. Epilogues are critical for integrating kernel outputs into larger computation graphs.
+
+### Add+Multiply
+A common fused operation in ML and linear algebra, where an elementwise addition is immediately followed by multiplication, often used for bias and scaling in neural network layers.
+
+---
+
+## 7. Linear Algebra and ML Operations
+
+### General Matrix Multiply (GEMM)
+Core matrix operation in linear algebra and deep learning. A GEMM is defined as C = αAB + βC for matrices A, B, and C. 
+
+### "Vanilla" GEMM (Naive GEMM) Kernel
+The **vanilla GEMM** is the simplest form of GEMM in CK. It:
+- Takes input matrices **A** and **B**
+- Multiplies them to produce output matrix **C**
+
+This is the **baseline** or **building block** GEMM that all other complex versions expand upon.
+
+### Grouped GEMM (GGEMMs)
+
+A kernel which calls multiple VGEMMs. Each call can have a different input shape. Each input shape problem first finds its corresponding kernel and then data is mapped to the work-group (blocks) of that kernel. 
+
+### Batched GEMM
+A kernel which calls VGEMMs with different "batches" of data. All batches have the same input shape. 
+
+### Split-K GEMM
+A parallelization strategy that partitions the reduction dimension (K) across multiple compute units, increasing parallelism for large matrix multiplications.
+
+### GEMV
+The operation of multiplying a matrix by a vector, producing another vector. GEMV (General Matrix Vector Multiplication) is a core linear algebra primitive, widely used in neural networks and scientific computing.
+
+### Inner Product
+Also known as the dot product, it computes the sum of elementwise products of two vectors, yielding a scalar.
+
+### Outer Product
+The result of multiplying a column vector by a row vector, producing a matrix. Outer products are used in rank-1 updates and some ML algorithms.
+
+### Norm
+A function that measures the magnitude of a vector or matrix, such as L2 (Euclidean) or L1 norm. Norms are used in regularization, normalization, and optimization.
+
+---
+
+## 8. Testing, Build, and Infrastructure
+
+### Regression Test
+Tests that are part of CK's ctest suite and explicitly take more than 30s to finish on gfx942.
+
+### Smoke Test
+Tests that are part of CK's ctest suite and take less than or equal to 30 seconds to finish on gfx942.
+
+---
+
+## 9. Low-Level Instructions and Optimizations
+
+### eXtensible Data Language (XDL) Instructions
+eXtensible Data Language (XDL) instructions are a set of specialized, low-level instructions used to optimize data movement, memory access, and layout in high-performance computing, GPU programming, and deep learning tasks.
+
+---
+
+## 10. Miscellaneous
+
+### HIP
+AMD's Heterogeneous-Computing Interface for Portability, a C++ runtime API and programming language that enables developers to create portable applications for AMD and NVIDIA GPUs. HIP provides a familiar CUDA-like programming model while maintaining compatibility across different GPU architectures.
+
+### CUDA
+NVIDIA's Compute Unified Device Architecture, a parallel computing platform and programming model for NVIDIA GPUs. CUDA provides a C++ extension for writing GPU kernels and managing GPU resources.
+
+### ROCm
+AMD's Radeon Open Compute platform, an open-source software stack for GPU computing that includes [HIP](#hip), libraries, and tools for high-performance computing and machine learning workloads on AMD GPUs.
+
+---
+
+## Scientific Context and References
+
+This terminology is grounded in parallel computing theory, numerical linear algebra, and computer architecture. For further reading, see:
+- [Building Efficient GEMM Kernels with CK Tile](https://rocm.blogs.amd.com/software-tools-optimization/building-efficient-gemm-kernels-with-ck-tile-vendo/README.html)
+- [CK Tile Flash](https://rocm.blogs.amd.com/software-tools-optimization/ck-tile-flash/README.html)
+
+This document assumes familiarity with parallel computing, linear algebra, and computer architecture principles.
--- a/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd_ngchw.cpp
+++ b/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd_ngchw.cpp
@@ -107,14 +107,14 @@ int execute_conv_fwd()
        auto& op_ptr        = op_ptrs[i];
        auto argument_ptr   = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
-                                                        {},
+                                                          {},
                                                        out.GetDeviceBuffer(),
                                                        in_lengths,
                                                        in_strides,
                                                        wei_lengths,
                                                        wei_strides,
-                                                        {},
-                                                        {},
+                                                          {},
+                                                          {},
                                                        out_lengths,
                                                        out_strides,
                                                        filter_strides,
--- a/client_example/10_grouped_convnd_bwd_data/grouped_conv2d_bwd_data.cpp
+++ b/client_example/10_grouped_convnd_bwd_data/grouped_conv2d_bwd_data.cpp
@@ -130,14 +130,14 @@ int main()
        auto& op_ptr        = op_ptrs[i];
        auto argument_ptr   = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
-                                                        {},
+                                                          {},
                                                        in.GetDeviceBuffer(),
                                                        out_lengths,
                                                        out_strides,
                                                        wei_lengths,
                                                        wei_strides,
-                                                        {},
-                                                        {},
+                                                          {},
+                                                          {},
                                                        in_lengths,
                                                        in_strides,
                                                        filter_strides,
--- a/client_example/10_grouped_convnd_bwd_data/grouped_conv2d_bwd_data_ngchw.cpp
+++ b/client_example/10_grouped_convnd_bwd_data/grouped_conv2d_bwd_data_ngchw.cpp
@@ -105,14 +105,14 @@ int main()
        auto& op_ptr        = op_ptrs[i];
        auto argument_ptr   = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
-                                                        {},
+                                                          {},
                                                        in.GetDeviceBuffer(),
                                                        out_lengths,
                                                        out_strides,
                                                        wei_lengths,
                                                        wei_strides,
-                                                        {},
-                                                        {},
+                                                          {},
+                                                          {},
                                                        in_lengths,
                                                        in_strides,
                                                        filter_strides,
--- a/client_example/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data.cpp
+++ b/client_example/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data.cpp
@@ -109,14 +109,14 @@ int main()
        auto& op_ptr        = op_ptrs[i];
        auto argument_ptr   = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
-                                                        {},
+                                                          {},
                                                        in.GetDeviceBuffer(),
                                                        out_lengths,
                                                        out_strides,
                                                        wei_lengths,
                                                        wei_strides,
-                                                        {},
-                                                        {},
+                                                          {},
+                                                          {},
                                                        in_lengths,
                                                        in_strides,
                                                        filter_strides,
--- a/client_example/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp
+++ b/client_example/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp
@@ -111,14 +111,14 @@ int main()
        auto& op_ptr        = op_ptrs[i];
        auto argument_ptr   = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
-                                                        {},
+                                                          {},
                                                        in.GetDeviceBuffer(),
                                                        out_lengths,
                                                        out_strides,
                                                        wei_lengths,
                                                        wei_strides,
-                                                        {},
-                                                        {},
+                                                          {},
+                                                          {},
                                                        in_lengths,
                                                        in_strides,
                                                        filter_strides,
--- a/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp
+++ b/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp
@@ -59,7 +59,7 @@ int main()
    SimpleDeviceMem y_dev_buf(sizeof(YDataType) * mn_size);

    std::array<const void*, 2> ab_input               = {a_dev_buf.GetDeviceBuffer(),
-                                           b_dev_buf.GetDeviceBuffer()};
+                                                         b_dev_buf.GetDeviceBuffer()};
    std::vector<ck::index_t> abStride                 = {Stride, 1};
    std::array<std::vector<ck::index_t>, 2> abStrides = {abStride, abStride};

--- a/client_example/15_reduce/reduce_nhwc_c.cpp
+++ b/client_example/15_reduce/reduce_nhwc_c.cpp
@@ -68,15 +68,15 @@ int main(int argc, char* argv[])
    SimpleDeviceMem out(sizeof(OutDataType) * num_out_elements);

    using DeviceOp     = ck::tensor_operation::device::DeviceReduce<InDataType,
-                                                                AccDataType,
-                                                                OutDataType,
-                                                                Rank,
-                                                                NumReduceDim,
-                                                                ReduceAdd,
-                                                                PassThrough,
-                                                                UnaryDivide,
-                                                                PropagateNan,
-                                                                OutputIndex>;
+                                                                    AccDataType,
+                                                                    OutDataType,
+                                                                    Rank,
+                                                                    NumReduceDim,
+                                                                    ReduceAdd,
+                                                                    PassThrough,
+                                                                    UnaryDivide,
+                                                                    PropagateNan,
+                                                                    OutputIndex>;
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

--- a/client_example/24_grouped_conv_activation/grouped_convnd_bwd_data_bilinear/grouped_conv_bwd_data_bilinear_residual_fp16.cpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_bwd_data_bilinear/grouped_conv_bwd_data_bilinear_residual_fp16.cpp
@@ -117,14 +117,14 @@ int execute_conv_bwd_data_bilinear()
        auto& op_ptr        = op_ptrs[i];
        auto argument_ptr   = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
-                                                        {in.GetDeviceBuffer()},
+                                                          {in.GetDeviceBuffer()},
                                                        in.GetDeviceBuffer(),
                                                        out_lengths,
                                                        out_strides,
                                                        wei_lengths,
                                                        wei_strides,
-                                                        {in_lengths},
-                                                        {in_strides},
+                                                          {in_lengths},
+                                                          {in_strides},
                                                        in_lengths,
                                                        in_strides,
                                                        filter_strides,
--- a/client_example/24_grouped_conv_activation/grouped_convnd_bwd_data_scale/grouped_conv_bwd_data_scale_fp16.cpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_bwd_data_scale/grouped_conv_bwd_data_scale_fp16.cpp
@@ -116,14 +116,14 @@ int execute_conv_bwd_data_scale()
        auto& op_ptr        = op_ptrs[i];
        auto argument_ptr   = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
-                                                        {},
+                                                          {},
                                                        in.GetDeviceBuffer(),
                                                        out_lengths,
                                                        out_strides,
                                                        wei_lengths,
                                                        wei_strides,
-                                                        {},
-                                                        {},
+                                                          {},
+                                                          {},
                                                        in_lengths,
                                                        in_strides,
                                                        filter_strides,
--- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_bilinear/grouped_conv_fwd_bilinear_residual_fp16.cpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_bilinear/grouped_conv_fwd_bilinear_residual_fp16.cpp
@@ -121,14 +121,14 @@ int execute_conv_fwd_bilinear()
        auto& op_ptr        = op_ptrs[i];
        auto argument_ptr   = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
-                                                        {out.GetDeviceBuffer()},
+                                                          {out.GetDeviceBuffer()},
                                                        out.GetDeviceBuffer(),
                                                        in_lengths,
                                                        in_strides,
                                                        wei_lengths,
                                                        wei_strides,
-                                                        {out_lengths},
-                                                        {out_strides},
+                                                          {out_lengths},
+                                                          {out_strides},
                                                        out_lengths,
                                                        out_strides,
                                                        filter_strides,
--- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp
@@ -222,13 +222,13 @@ bool run_grouped_conv_fwd_convscale_reduce(
                                        ck::tensor_operation::element_wise::Scale{scale_wei},
                                        {}};
    auto conv_ok        = ConvolutionScale<InDataType,
-                                    WeiDataType,
-                                    ConvOutDataType,
-                                    ConvElementOp,
-                                    InLayout,
-                                    WeiLayout,
-                                    OutLayout,
-                                    NumDimSpatial>(in,
+                                           WeiDataType,
+                                           ConvOutDataType,
+                                           ConvElementOp,
+                                           InLayout,
+                                           WeiLayout,
+                                           OutLayout,
+                                           NumDimSpatial>(in,
                                                   wei,
                                                   conv_out,
                                                   elementwise_op,
@@ -717,15 +717,15 @@ bool TensorFullReduction(SimpleDeviceMem& tensor,
    {
        std::cout << "\nReduction of spatial dimensions:" << std::endl;
        using DeviceOp     = ck::tensor_operation::device::DeviceReduce<OutDataType,
-                                                                    OutDataType,
-                                                                    OutDataType,
-                                                                    NumDimSpatial,
-                                                                    NumDimSpatial,
-                                                                    ReduceOperation,
-                                                                    PassThrough,
-                                                                    AccElementwiseOperation,
-                                                                    true,   // PropagateNan
-                                                                    false>; // OutputIndex
+                                                                        OutDataType,
+                                                                        OutDataType,
+                                                                        NumDimSpatial,
+                                                                        NumDimSpatial,
+                                                                        ReduceOperation,
+                                                                        PassThrough,
+                                                                        AccElementwiseOperation,
+                                                                        true,   // PropagateNan
+                                                                        false>; // OutputIndex
        const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
            DeviceOp>::GetInstances();

--- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_scale/grouped_conv_fwd_scale_fp16.cpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_scale/grouped_conv_fwd_scale_fp16.cpp
@@ -120,14 +120,14 @@ int execute_conv_fwd_scale()
        auto& op_ptr        = op_ptrs[i];
        auto argument_ptr   = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
-                                                        {},
+                                                          {},
                                                        out.GetDeviceBuffer(),
                                                        in_lengths,
                                                        in_strides,
                                                        wei_lengths,
                                                        wei_strides,
-                                                        {},
-                                                        {},
+                                                          {},
+                                                          {},
                                                        out_lengths,
                                                        out_strides,
                                                        filter_strides,
--- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab.inc
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab.inc
@@ -129,8 +129,8 @@ int execute_conv_fwd_scaleadd_ab()
                                                        in_strides,
                                                        wei_lengths,
                                                        wei_strides,
-                                                        {},
-                                                        {},
+                                                          {},
+                                                          {},
                                                        out_lengths,
                                                        out_strides,
                                                        filter_strides,
--- a/client_example/25_wrapper/wrapper_img2col.cpp
+++ b/client_example/25_wrapper/wrapper_img2col.cpp
@@ -132,9 +132,9 @@ void PerformImageToColumnPad0(const ck::index_t G,
                                                                  ck::wrapper::size<0>(tile_shape));

    const auto kernel    = DeviceImageToColumnPad0<decltype(input_tensor_global),
-                                                decltype(output_tensor_global),
-                                                decltype(tile_shape),
-                                                decltype(thread_layout)>;
+                                                   decltype(output_tensor_global),
+                                                   decltype(tile_shape),
+                                                   decltype(thread_layout)>;
    const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
                                                  kernel,
                                                  dim3(grid_size_x, grid_size_y, 1),
--- a/client_example/CMakeLists.txt
+++ b/client_example/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.15)
 project(ck_app)
-add_compile_options(-std=c++17)
+add_compile_options(-std=c++20)

 if (DTYPES)
    add_definitions(-DDTYPES)
--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -22,7 +22,7 @@ file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS

 add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)

-add_compile_options(-std=c++17)
+add_compile_options(-std=c++20)

 file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
 # TODO: Use object library
--- a/codegen/include/ck/host/stringutils.hpp
+++ b/codegen/include/ck/host/stringutils.hpp
@@ -91,8 +91,9 @@ inline auto Transform(const Range& r, F f) -> std::vector<decltype(f(*r.begin())
 }

 template <class Range1, class Range2, class F>
-inline auto Transform(const Range1& r1, const Range2& r2, F f)
-    -> std::vector<decltype(f(*r1.begin(), *r2.begin()))>
+inline auto Transform(const Range1& r1,
+                      const Range2& r2,
+                      F f) -> std::vector<decltype(f(*r1.begin(), *r2.begin()))>
 {
    std::vector<decltype(f(*r1.begin(), *r2.begin()))> result;
    assert(std::distance(r1.begin(), r1.end()) == std::distance(r2.begin(), r2.end()));
--- a/codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp
+++ b/codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp
@@ -142,12 +142,11 @@ std::vector<Operation_Conv_Fwd_Xdl_Cshuffle> Operation_Conv_Fwd_Xdl_Cshuffle::Cr
        x.A                = TensorDesc{prob.ADataType, prob.ALayout};
        x.B                = TensorDesc{prob.BDataType, prob.BLayout};
        x.E                = TensorDesc{prob.EDataType, prob.ELayout};
-        x.Ds               = Transform(prob.DsLayout, prob.DsDataType, [](auto lo, auto dt) {
-            return TensorDesc{dt, lo};
-        });
-        x.a_elem_op        = prob.AElementOp;
-        x.b_elem_op        = prob.BElementOp;
-        x.cde_elem_op      = prob.CDEElementOp;
+        x.Ds               = Transform(
+            prob.DsLayout, prob.DsDataType, [](auto lo, auto dt) { return TensorDesc{dt, lo}; });
+        x.a_elem_op   = prob.AElementOp;
+        x.b_elem_op   = prob.BElementOp;
+        x.cde_elem_op = prob.CDEElementOp;
        x.update_prologue(prologue);
        x.update_epilogue(epilogue);
        result.push_back(x);
--- a/codegen/test/batched_gemm_softmax_gemm.cpp
+++ b/codegen/test/batched_gemm_softmax_gemm.cpp
@@ -55,12 +55,12 @@ TEST_CASE(test_problem_kernel)
        std::cout << "Testing solution " << std::to_string(i + 1) << std::endl;
        auto&& solution = solutions[i];
        auto src        = ck::host::InterpolateString(gemm_compile_check,
-                                               {{"include", prob.GetIncludeHeader()},
-                                                {"template", solution.ToTemplateString()},
-                                                {"m", std::to_string(prob.M)},
-                                                {"n", std::to_string(prob.N)},
-                                                {"k", std::to_string(prob.K)},
-                                                {"o", std::to_string(prob.O)}});
+                                                      {{"include", prob.GetIncludeHeader()},
+                                                       {"template", solution.ToTemplateString()},
+                                                       {"m", std::to_string(prob.M)},
+                                                       {"n", std::to_string(prob.N)},
+                                                       {"k", std::to_string(prob.K)},
+                                                       {"o", std::to_string(prob.O)}});
        auto srcs       = get_headers_for_test();
        srcs.push_back({"main.cpp", src});
        rtc::compile_options options;
--- a/codegen/test/gemm_multiple_d.cpp
+++ b/codegen/test/gemm_multiple_d.cpp
@@ -60,11 +60,11 @@ TEST_CASE(test_problem_kernel)
        std::cout << "Testing solution " << std::to_string(i + 1) << std::endl;
        auto&& solution = solutions[i];
        auto src        = ck::host::InterpolateString(gemm_compile_check,
-                                               {{"include", prob.GetIncludeHeader()},
-                                                {"template", solution.ToTemplateString()},
-                                                {"m", std::to_string(prob.M)},
-                                                {"n", std::to_string(prob.N)},
-                                                {"k", std::to_string(prob.K)}});
+                                                      {{"include", prob.GetIncludeHeader()},
+                                                       {"template", solution.ToTemplateString()},
+                                                       {"m", std::to_string(prob.M)},
+                                                       {"n", std::to_string(prob.N)},
+                                                       {"k", std::to_string(prob.K)}});
        auto srcs       = get_headers_for_test();
        srcs.push_back({"main.cpp", src});
        rtc::compile_options options;
--- a/codegen/test/rtc/include/rtc/tmp_dir.hpp
+++ b/codegen/test/rtc/include/rtc/tmp_dir.hpp
@@ -16,7 +16,7 @@ struct tmp_dir

    void execute(const std::string& cmd) const;

-    tmp_dir(tmp_dir const&) = delete;
+    tmp_dir(tmp_dir const&)            = delete;
    tmp_dir& operator=(tmp_dir const&) = delete;

    ~tmp_dir();
--- a/codegen/test/rtc/src/compile_kernel.cpp
+++ b/codegen/test/rtc/src/compile_kernel.cpp
@@ -94,7 +94,7 @@ kernel clang_compile_kernel(const std::vector<src_file>& srcs, compile_options o
    assert(not srcs.empty());
    tmp_dir td{"compile"};
    options.flags += " -I. -O3";
-    options.flags += " -std=c++17";
+    options.flags += " -std=c++20";
    options.flags += " --offload-arch=" + get_device_name();
    std::string out;

@@ -278,7 +278,7 @@ std::vector<std::vector<char>> compile_hip_src_with_hiprtc(const std::vector<src
 static kernel hiprtc_compile_kernel(const std::vector<src_file>& srcs, compile_options options)
 {
    options.flags += " -I. -O3";
-    options.flags += " -std=c++17";
+    options.flags += " -std=c++20";
    options.flags += " -DCK_CODE_GEN_RTC";
    options.flags += " --offload-arch=" + get_device_name();
    auto cos = compile_hip_src_with_hiprtc(srcs, options);
--- a/docs/install/Composable-Kernel-prerequisites.rst
+++ b/docs/install/Composable-Kernel-prerequisites.rst
@@ -29,4 +29,4 @@ The following prerequisites are required to build and install Composable Kernel:
 * zlib1g-dev  
 * libzstd-dev  
 * openssh-server  
-* clang-format-12  
+* clang-format-18
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -128,3 +128,5 @@ add_example_executable(example_gemm_wmma_fp16_pk_i4_v3 gemm_wmma_fp16_pk_i4_v3.c
 add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16_pk_i4_v3)
 add_example_executable(example_gemm_wmma_fp16_fp8_v3 gemm_wmma_fp16_fp8_v3.cpp)
 add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16_fp8_v3)
+add_example_executable(example_gemm_wmma_fp16_pk_i4_v3_b_scale gemm_wmma_fp16_pk_i4_v3_b_scale.cpp)
+add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16_pk_i4_v3_b_scale)
--- a/example/01_gemm/gemm_wmma_fp16_pk_i4_v3_b_scale.cpp
+++ b/example/01_gemm/gemm_wmma_fp16_pk_i4_v3_b_scale.cpp
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_b_scale.hpp"
+
+using ADataType        = ck::half_t;
+using BDataType        = ck::pk_i4_t;
+using BScaleDataType   = ck::half_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::half_t;
+using CDataType        = ck::half_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+static constexpr bool PermuteA = false;
+static constexpr bool PermuteB = true;
+
+static constexpr ck::index_t Scale_Block_N = 1;
+static constexpr ck::index_t Scale_Block_K = 128;
+
+static constexpr ck::index_t KPerBlock = 64;
+
+// clang-format off
+using DeviceGemmV2Instance = 
+    ck::tensor_operation::device::DeviceGemm_BScale_Wmma_CShuffleV3<
+        ALayout,   BLayout,  CLayout,   
+        ADataType, BDataType, BScaleDataType, CDataType, AccDataType, CShuffleDataType, 
+        AElementOp, BElementOp, CElementOp, GemmDefault, 
+        256, Scale_Block_N, Scale_Block_K,
+        128, 128,
+        KPerBlock, 8, 8,
+        16,  16,
+        4,    2,
+        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 8, 8, 0,
+        S<2, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
+        2, 8, 8, 0,
+        1, 1, S<1, 32, 1, 8>, 8,
+        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3,
+        CDataType, CDataType, PermuteA, PermuteB>;
+
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        AccDataType,
+                                                                        CDataType,
+                                                                        AccDataType,
+                                                                        PassThrough,
+                                                                        PassThrough,
+                                                                        PassThrough>;
+template <typename ProblemType>
+bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto M       = problem_size.M;
+    auto N       = problem_size.N;
+    auto K       = problem_size.K;
+    auto StrideA = problem_size.StrideA;
+    auto StrideB = problem_size.StrideB;
+    auto StrideC = problem_size.StrideC;
+    auto KBatch  = problem_size.KBatch;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    auto f_get_default_stride =
+        [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
+            {
+                // give a chance if stride is -1, return a default packed stride
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+                {
+                    return static_cast<std::size_t>(col);
+                }
+                else
+                {
+                    return static_cast<std::size_t>(row);
+                }
+            }
+            else
+                return static_cast<std::size_t>(stride);
+        };
+
+    ck::index_t Scale_Stride_BN = (K + Scale_Block_K - 1) / Scale_Block_K;
+
+    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
+    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
+    StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BScaleDataType> b1_k_n(f_host_tensor_descriptor((K + Scale_Block_K - 1) / Scale_Block_K,
+                                                           (N + Scale_Block_N - 1) / Scale_Block_N,
+                                                           Scale_Stride_BN,
+                                                           BLayout{}));
+
+    switch(config.init_method)
+    {
+    case 0:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
+        break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
+        break;
+    case 2:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
+        break;
+    case 3:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
+        break;
+    case 4:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
+        break;
+    case 5:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.5, 0.5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
+    }
+
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2);
+    DeviceMem b1_scale_device_buf(sizeof(BScaleDataType) * b1_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    // weight permute
+    if constexpr(PermuteB)
+    {
+        int K1 = KPerBlock;
+        int K0 = K / KPerBlock;
+
+        // int K0, N, K1
+        for(int j = 0; j < K0; j++)
+        {
+            for(int i = 0; i < N; i++)
+            {
+                for(int jj = 0; jj < K1; jj++)
+                {
+                    b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj));
+                }
+            }
+        }
+    }
+    else
+    {
+        for(int i = 0; i < N; i++)
+        {
+            for(int j = 0; j < K; j++)
+            {
+                b_k_n_permute(i * K + j) = b_k_n(i * K + j);
+            }
+        }
+    }
+
+    // vector pk_i4x4 permute
+    for(int i = 0; i < N; i++)
+    {
+        for(int j = 0; j < K; j += 8)
+        {
+            int input[8];
+
+            for(int k = 0; k < 4; k++)
+            {
+                int i4x2         = b_k_n_permute(j + k * 2, i).data;
+                input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+            }
+
+            // permute 01234567->20643175
+            {
+                int hi   = input[2];
+                int lo   = input[0];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 0, i) = i4x2;
+            }
+
+            {
+                int hi   = input[6];
+                int lo   = input[4];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 2, i) = i4x2;
+            }
+
+            {
+                int hi   = input[3];
+                int lo   = input[1];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 4, i) = i4x2;
+            }
+
+            {
+                int hi   = input[7];
+                int lo   = input[5];
+                int i4x2 = (hi << 4) | lo;
+
+                b_k_n_permute(j + 6, i) = i4x2;
+            }
+        }
+    }
+
+    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    b_k_n_device_buf.ToDevice(b_k_n_permute.mData.data());
+    b1_scale_device_buf.ToDevice(b1_k_n.mData.data());
+    DeviceMem workspace;
+
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CElementOp{};
+
+    // do GEMM
+    auto gemm      = DeviceGemmV2Instance{};
+    auto invoker   = gemm.MakeInvoker();
+    float ave_time = 0;
+
+    auto argument =
+        gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+                          static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                          static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                          M,
+                          N,
+                          K,
+                          StrideA,
+                          StrideB,
+                          StrideC,
+                          Scale_Stride_BN,
+                          static_cast<BScaleDataType*>(b1_scale_device_buf.GetDeviceBuffer()),
+                          KBatch,
+                          a_element_op,
+                          b_element_op,
+                          c_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+        return true;
+    }
+
+    std::string device_name = ck::get_device_name();
+    if(!(device_name.find("gfx11") != std::string::npos ||
+         device_name.find("gfx12") != std::string::npos))
+    {
+        std::cout << "This kernel support gfx1100 and gfx1200 only" << std::endl;
+
+        return true;
+    }
+
+    bool pass = true;
+    if(config.do_verification)
+    {
+        Tensor<float> b_k_n_dequant({K, N});
+
+        float v_b = 0;
+        for(int n = 0; n < N; n++)
+        {
+            for(int k = 0; k < K; k++)
+            {
+                ck::pk_i4_t i4x2 = b_k_n(k, n).data;
+                int8_t i4        = 0;
+                if(k % 2 == 1)
+                    i4 = (i4x2.data >> 0) & 0xf;
+                else
+                    i4 = (i4x2.data >> 4) & 0xf;
+                i4  = i4 - 8;
+                v_b = ck::type_convert<float>(i4);
+
+                b_k_n_dequant(k, n) =
+                    ck::type_convert<float>(v_b) *
+                    ck::type_convert<float>(b1_k_n(k / Scale_Block_K, n / Scale_Block_N));
+            }
+        }
+
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n_dequant, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0});
+        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+        pass &= ck::utils::check_err(c_m_n_device_result,
+                                     c_m_n_host_result,
+                                     "Error: Incorrect results!",
+                                     get_rtol<CDataType>(),
+                                     get_atol<CDataType>());
+    }
+
+    if(config.time_kernel)
+    {
+        ave_time =
+            invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 20, 50, true, 50});
+
+        std::size_t flop = 2_uz * M * N * K;
+        std::size_t num_btype =
+            sizeof(ADataType) * M * K +
+            sizeof(BDataType) * K * N /
+                (ck::is_same_v<ck::remove_cvref_t<BDataType>, ck::pk_i4_t> ? 2 : 1) +
+            sizeof(CDataType) * M * N;
+
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << gemm.GetTypeString() << std::endl;
+    }
+    return pass;
+}
+
+bool run_gemm_splitk_example(int argc, char* argv[])
+{
+    ProblemSizeSplitK problem_size;
+    ExecutionConfig config;
+
+    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
+}
+
+int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -31,15 +31,10 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
 #else
         < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,   GemmDefault,   256,   128,   128,     4,  2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              2,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,      true,               7,               1>;
 #endif
-    // clang-format on
+// clang-format on

-    using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
-                                                                            BDataType,
-                                                                            CDataType,
-                                                                            AccDataType,
-                                                                            AElementOp,
-                                                                            BElementOp,
-                                                                            CElementOp>;
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

 using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
                                                                             BLayout,
--- a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
@@ -56,10 +56,10 @@ using CDataType   = float;
 using AccDataType = float;

 #endif
-    // clang-format on
+// clang-format on

-    using ReferenceGemmInstance = ck::tensor_operation::host::
-        ReferenceGemm<ADataType, BDataType, CDataType, float, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, float, AElementOp, BElementOp, CElementOp>;

 template <typename DataType>
 std::ostream& show_2d_matrix(std::ostream& os, Tensor<DataType>& matrix)
--- a/example/12_reduce/reduce_blockwise_impl.hpp
+++ b/example/12_reduce/reduce_blockwise_impl.hpp
@@ -117,7 +117,7 @@ int reduce_blockwise_impl(bool do_verification,
    using InOutDataTypeInDevice = typename std::
        conditional<std::is_same<InOutDataType, int4_t>::value, int8_t, InOutDataType>::type;
 #else
-    using InOutDataTypeInDevice   = InOutDataType;
+    using InOutDataTypeInDevice = InOutDataType;
 #endif

    using DeviceReduceInstance =
--- a/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp
@@ -175,15 +175,15 @@ auto run_gemm_reduce_max_xdl(ck::index_t M,
    auto invoker   = device_op.MakeInvoker();
    auto argument  = device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
                                           b_device_buf.GetDeviceBuffer(),
-                                           {},
+                                            {},
                                           e_device_buf.GetDeviceBuffer(),
-                                           {r0_device_buf.GetDeviceBuffer()},
+                                            {r0_device_buf.GetDeviceBuffer()},
                                           M,
                                           N,
                                           K,
                                           StrideA,
                                           StrideB,
-                                           {},
+                                            {},
                                           StrideE,
                                           a_element_op,
                                           b_element_op,
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -207,7 +207,7 @@ int main(int argc, char* argv[])
    auto argument     = batched_gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
                                              b_device_buf.GetDeviceBuffer(),
                                              nullptr,
-                                              {},
+                                                  {},
                                              c_device_buf.GetDeviceBuffer(),
                                              p_reduces,
                                              M,
@@ -216,9 +216,9 @@ int main(int argc, char* argv[])
                                              StrideA,
                                              StrideB,
                                              StrideC,
-                                              {},
+                                                  {},
                                              gemm_element_ops,
-                                              {},
+                                                  {},
                                              reduce_in_element_ops,
                                              reduce_out_element_ops,
                                              BatchCount);
--- a/example/27_layernorm2d_fwd/run_layernorm_example.inc
+++ b/example/27_layernorm2d_fwd/run_layernorm_example.inc
@@ -44,9 +44,9 @@ int run_layernorm2d_fwd_example()
        {0, 1},
        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
        std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
-                                 save_mean.mDesc.GetStrides().end()},
+                                    save_mean.mDesc.GetStrides().end()},
        std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
-                                 save_mean.mDesc.GetStrides().end()},
+                                    save_mean.mDesc.GetStrides().end()},
        {1},
        1e-4,
        x_dev.GetDeviceBuffer(),
--- a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
+++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
@@ -126,10 +126,10 @@ int run(int argc, char* argv[])

        if(i < 4)
        {
-            std::cout << "a_gs_ms_ks[" << i << "]: " << a_gs_ms_ks.mDesc << ", "
-                      << "b0_gs_ns_ks[" << i << "]: " << b0_gs_ns_ks.mDesc << ", "
-                      << "b1_gs_os_ns[" << i << "]: " << b1_gs_os_ns.mDesc << ", "
-                      << "c_gs_ms_os[" << i << "]: " << c_gs_ms_os_device_result.mDesc << std::endl;
+            std::cout << "a_gs_ms_ks[" << i << "]: " << a_gs_ms_ks.mDesc << ", " << "b0_gs_ns_ks["
+                      << i << "]: " << b0_gs_ns_ks.mDesc << ", " << "b1_gs_os_ns[" << i
+                      << "]: " << b1_gs_os_ns.mDesc << ", " << "c_gs_ms_os[" << i
+                      << "]: " << c_gs_ms_os_device_result.mDesc << std::endl;
        }

        switch(init_method)
--- a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
+++ b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
@@ -129,11 +129,11 @@ int main()
        auto argument_ptr    = device_instance.MakeArgumentPointer(
            out_dev.GetDeviceBuffer(),
            {ck::type_convert<EmbType*>(emb_a_dev.GetDeviceBuffer()),
-             ck::type_convert<EmbType*>(emb_b_dev.GetDeviceBuffer()),
-             ck::type_convert<EmbType*>(emb_c_dev.GetDeviceBuffer())},
+                ck::type_convert<EmbType*>(emb_b_dev.GetDeviceBuffer()),
+                ck::type_convert<EmbType*>(emb_c_dev.GetDeviceBuffer())},
            {ck::type_convert<IndexType*>(index_a_dev.GetDeviceBuffer()),
-             ck::type_convert<IndexType*>(index_b_dev.GetDeviceBuffer()),
-             ck::type_convert<IndexType*>(index_c_dev.GetDeviceBuffer())},
+                ck::type_convert<IndexType*>(index_b_dev.GetDeviceBuffer()),
+                ck::type_convert<IndexType*>(index_c_dev.GetDeviceBuffer())},
            gamma_dev.GetDeviceBuffer(),
            beta_dev.GetDeviceBuffer(),
            current_dim,
--- a/example/39_permute/common.hpp
+++ b/example/39_permute/common.hpp
@@ -249,8 +249,8 @@ inline auto to_array(Range& range) noexcept
 }

 template <typename Axes>
-inline auto is_valid_axes(const Axes& axes)
-    -> std::enable_if_t<detail::is_random_access_range_v<Axes>, bool>
+inline auto
+is_valid_axes(const Axes& axes) -> std::enable_if_t<detail::is_random_access_range_v<Axes>, bool>
 {
    using std::empty;
    if(empty(axes))
@@ -357,10 +357,11 @@ auto extend_axes(const Problem::Axes& axes)
 }

 template <typename Shape, typename Indices>
-auto advance_indices(const Shape& shape, Indices& indices) -> std::enable_if_t<
-    detail::is_bidirectional_range_v<Shape> && detail::is_sized_range_v<Shape> &&
-        detail::is_bidirectional_range_v<Indices> && detail::is_sized_range_v<Indices>,
-    bool>
+auto advance_indices(const Shape& shape, Indices& indices)
+    -> std::enable_if_t<
+        detail::is_bidirectional_range_v<Shape> && detail::is_sized_range_v<Shape> &&
+            detail::is_bidirectional_range_v<Indices> && detail::is_sized_range_v<Indices>,
+        bool>
 {
    using std::size;
    if(!(is_valid_shape(shape) && is_valid_indices(shape, indices) && size(shape) == size(indices)))
--- a/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc
+++ b/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc
@@ -65,9 +65,9 @@ int run_groupnorm_fwd_example(int argc, char* argv[])
        {0, 0, 0, C, 1},
        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
        std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
-                                 save_mean.mDesc.GetStrides().end()},
+                                    save_mean.mDesc.GetStrides().end()},
        std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
-                                 save_mean.mDesc.GetStrides().end()},
+                                    save_mean.mDesc.GetStrides().end()},
        {1, 2, 4}, // reduction dimension: [H, W, C]
        1e-6,
        x_dev.GetDeviceBuffer(),
--- a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
+++ b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
@@ -152,7 +152,7 @@ int main(int argc, char* argv[])

    std::array<const void*, 1> inputs = {input_dev_buf.GetDeviceBuffer()};
    std::array<void*, 2> outputs      = {output_scaled_casted_transposed_dev_buf.GetDeviceBuffer(),
-                                    output_scaled_casted_dev_buf.GetDeviceBuffer()};
+                                         output_scaled_casted_dev_buf.GetDeviceBuffer()};

    std::cout << "Input: " << input.mDesc << std::endl;
    std::cout << "Scale: " << scale << std::endl;
@@ -164,8 +164,8 @@ int main(int argc, char* argv[])
    auto launch_transpose_scale = [&]() {
        auto transposeScale = DeviceElementwisePermuteInstance{};
        auto argument       = transposeScale.MakeArgumentPointer(dims,
-                                                           {in_strides},
-                                                           {out_strides, in_strides},
+                                                                 {in_strides},
+                                                                 {out_strides, in_strides},
                                                           inputs,
                                                           outputs,
                                                           ScalePassThrough{scale});
--- a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp
+++ b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp
@@ -213,7 +213,7 @@ int main(int argc, char* argv[])
    auto invoker   = device_op.MakeInvoker();
    auto argument  = device_op.MakeArgument(
        std::array<const void*, 2>{a0_device_buf.GetDeviceBuffer(),
-                                   a1_device_buf.GetDeviceBuffer()},
+                                    a1_device_buf.GetDeviceBuffer()},
        std::array<const void*, 1>{b_device_buf.GetDeviceBuffer()},
        std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
        e_device_buf.GetDeviceBuffer(),
--- a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp8.cpp
+++ b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp8.cpp
@@ -194,9 +194,9 @@ int main(int argc, char* argv[])
    auto invoker   = device_op.MakeInvoker();
    auto argument  = device_op.MakeArgument(
        std::array<const void*, 2>{a0_device_buf.GetDeviceBuffer(),
-                                   a1_device_buf.GetDeviceBuffer()},
+                                    a1_device_buf.GetDeviceBuffer()},
        std::array<const void*, 2>{b0_device_buf.GetDeviceBuffer(),
-                                   b1_device_buf.GetDeviceBuffer()},
+                                    b1_device_buf.GetDeviceBuffer()},
        std::array<const void*, 0>{},
        e_device_buf.GetDeviceBuffer(),
        std::array<std::vector<ck::index_t>, 2>{a0_ms_ks_lengths, a1_ms_ks_lengths},
--- a/example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp
+++ b/example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp
@@ -265,10 +265,10 @@ bool run_grouped_conv_fwd(bool do_verification,
    auto device_ew_scale = DeviceElementwiseScale{};
    auto scale_invoker   = device_ew_scale.MakeInvoker();
    auto scale_argument  = device_ew_scale.MakeArgument(e_g_n_k_wos_lengths,
-                                                       {e_g_n_k_wos_strides},
-                                                       {e_g_n_k_wos_strides},
-                                                       {conv_device_buf.GetDeviceBuffer()},
-                                                       {out_device_buf.GetDeviceBuffer()},
+                                                        {e_g_n_k_wos_strides},
+                                                        {e_g_n_k_wos_strides},
+                                                        {conv_device_buf.GetDeviceBuffer()},
+                                                        {out_device_buf.GetDeviceBuffer()},
                                                       scale_convert);

    if(!device_ew_scale.IsSupportedArgument(scale_argument))
--- a/example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc
+++ b/example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc
@@ -46,9 +46,9 @@ int run_layernorm4d_fwd_example()
        {0, W * C, C, 1},
        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
        std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
-                                 save_mean.mDesc.GetStrides().end()},
+                                    save_mean.mDesc.GetStrides().end()},
        std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
-                                 save_mean.mDesc.GetStrides().end()},
+                                    save_mean.mDesc.GetStrides().end()},
        {1, 2, 3},
        1e-4,
        x_dev.GetDeviceBuffer(),
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -31,7 +31,7 @@ foreach(gpu IN LISTS GPU_TARGETS)
            example_compile_options(example_moe_gemm1_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS})
            example_compile_options(example_moe_gemm2_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS})
        endif()
-        set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
+        set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1")
        example_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS})
        example_compile_options(example_moe_gemm1_xdl_fp8 PRIVATE ${GEMM_OPTIONS})
        example_compile_options(example_moe_gemm2_xdl_fp8 PRIVATE ${GEMM_OPTIONS})
@@ -39,22 +39,22 @@ foreach(gpu IN LISTS GPU_TARGETS)
    endif()
 endforeach()

-set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
+set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1")
 set(BLOCKSCALE_GEMM_OPTIONS )
 check_cxx_compiler_flag("-mllvm --misched-bottomup=1" HAS_MISCHED_BOTTOMUP)
 check_cxx_compiler_flag("-mllvm --misched-prera-direction=bottomup" HAS_MISCHED_PRERA_DIRECTION)

 if(hip_VERSION_FLAT LESS 600443483 OR hip_VERSION_FLAT GREATER_EQUAL 700000000)
  if(HAS_MISCHED_BOTTOMUP)
-     list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --schedmodel=0 -mllvm --misched-bottomup=1")
+     list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --schedmodel=0 -mllvm --misched-bottomup=1")
  elseif(HAS_MISCHED_PRERA_DIRECTION)
-     list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --schedmodel=0 -mllvm --misched-prera-direction=bottomup")
+     list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --schedmodel=0 -mllvm --misched-prera-direction=bottomup")
  endif()
 else()
  if(HAS_MISCHED_BOTTOMUP)
-    list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-bottomup=1")
+    list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --misched-bottomup=1")
  elseif(HAS_MISCHED_PRERA_DIRECTION)
-    list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-prera-direction=bottomup")
+    list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --misched-prera-direction=bottomup")
  endif()
 endif()

@@ -62,7 +62,6 @@ check_cxx_compiler_flag("-mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupa
 if(HAS_MAX_OCCUPANCY_EXPERIMENTAL)
    list(APPEND BLOCKSCALE_GEMM_OPTIONS -mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupancy-experimental)
 endif()
-# list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-bottomup=1")
 example_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS})
 example_compile_options(example_moe_gemm1_xdl_fp8 PRIVATE ${GEMM_OPTIONS})
 example_compile_options(example_moe_gemm2_xdl_fp8 PRIVATE ${GEMM_OPTIONS})
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp
@@ -357,7 +357,7 @@ int main(int argc, char* argv[])
                int n1 = n % NLane;

                int k0 = k / (KLane * KPack);
-                tempk = k % (KLane * KPack);
+                tempk  = k % (KLane * KPack);
                int k1 = tempk / KPack;
                int k2 = tempk % KPack;

--- a/example/67_gemm_microscaling/CMakeLists.txt
+++ b/example/67_gemm_microscaling/CMakeLists.txt
@@ -58,7 +58,7 @@ example_compile_options(example_moe_gemm1_xdl_mx_fp4_bpreshuffle PRIVATE ${FP4_M
 example_compile_options(example_moe_gemm2_xdl_mx_fp4_bpreshuffle PRIVATE ${FP4_MXGEMM_OPTIONS})

 set(FP8_MXGEMM_OPTIONS)
-list(APPEND FP8_MXGEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
+list(APPEND FP8_MXGEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1")
 example_compile_options(example_gemm_mx_fp8 PRIVATE ${FP8_MXGEMM_OPTIONS})
 example_compile_options(example_gemm_mx_bf8 PRIVATE ${FP8_MXGEMM_OPTIONS})

--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -24,26 +24,27 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
    set(result 1)
    if(DEFINED DTYPES)
        foreach(source IN LISTS FILE_NAME)
+            get_filename_component(source_name ${source} NAME)
            set(test 0)
-            if((source MATCHES "_fp16" OR source MATCHES "_f16") AND NOT "fp16" IN_LIST DTYPES)
+            if((source_name MATCHES "_fp16" OR source_name MATCHES "_f16") AND NOT "fp16" IN_LIST DTYPES)
                set(test 1)
            endif()
-            if((source MATCHES "_fp32" OR source MATCHES "_f32") AND NOT "fp32" IN_LIST DTYPES)
+            if((source_name MATCHES "_fp32" OR source_name MATCHES "_f32") AND NOT "fp32" IN_LIST DTYPES)
                set(test 1)
            endif()
-            if((source MATCHES "_fp64" OR source MATCHES "_f64") AND NOT "fp64" IN_LIST DTYPES)
+            if((source_name MATCHES "_fp64" OR source_name MATCHES "_f64") AND NOT "fp64" IN_LIST DTYPES)
                set(test 1)
            endif()
-            if((source MATCHES "_fp8" OR source MATCHES "_f8") AND NOT "fp8" IN_LIST DTYPES)
+            if((source_name MATCHES "_fp8" OR source_name MATCHES "_f8") AND NOT "fp8" IN_LIST DTYPES)
                set(test 1)
            endif()
-            if((source MATCHES "_bf8" OR source MATCHES "_bf8") AND NOT "bf8" IN_LIST DTYPES)
+            if((source_name MATCHES "_bf8" OR source_name MATCHES "_bf8") AND NOT "bf8" IN_LIST DTYPES)
                set(test 1)
            endif()
-            if((source MATCHES "_bf16" OR source MATCHES "_b16") AND NOT "bf16" IN_LIST DTYPES)
+            if((source_name MATCHES "_bf16" OR source_name MATCHES "_b16") AND NOT "bf16" IN_LIST DTYPES)
                set(test 1)
            endif()
-            if((source MATCHES "_int8" OR source MATCHES "_i8") AND NOT "int8" IN_LIST DTYPES)
+            if((source_name MATCHES "_int8" OR source_name MATCHES "_i8") AND NOT "int8" IN_LIST DTYPES)
                set(test 1)
            endif()
            if(test EQUAL 1)
@@ -55,73 +56,65 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)

    set(EX_TARGETS ${SUPPORTED_GPU_TARGETS})

-    #Do not build any DL examples if DL_KERNELS not set
    foreach(source IN LISTS FILE_NAME)
-        if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
+        get_filename_component(source_name ${source} NAME)
+        #Do not build any DL examples if DL_KERNELS not set
+        if(NOT DEFINED DL_KERNELS AND source_name MATCHES "_dl")
            message(DEBUG "removing dl example ${source} ")
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
-    endforeach()
-    #Do not build any DPP examples if DPP_KERNELS not set
-    foreach(source IN LISTS FILE_NAME)
-        if(NOT DEFINED DPP_KERNELS AND source MATCHES "_dpp")
+        #Do not build any DPP examples if DPP_KERNELS not set
+        if(NOT DEFINED DPP_KERNELS AND source_name MATCHES "_dpp")
            message(DEBUG "removing dpp example ${source} ")
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
-    endforeach()
-    #Do not build any XDL examples if gfx9 targets are not on the list
-    foreach(source IN LISTS FILE_NAME)
-        if(NOT EX_TARGETS MATCHES "gfx9" AND source MATCHES "_xdl")
+        #Do not build any XDL examples if gfx9 targets are not on the list
+        if(NOT EX_TARGETS MATCHES "gfx9" AND source_name MATCHES "_xdl")
            message(DEBUG "removing xdl example ${source} ")
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
-    endforeach()
-    #Do not build any WMMA examples if gfx11 targets are not on the list
-    foreach(source IN LISTS FILE_NAME)
-	if(NOT EX_TARGETS MATCHES "gfx11" AND NOT EX_TARGETS MATCHES "gfx12" AND source MATCHES "_wmma")
+        #Do not build any WMMA examples if gfx11 targets are not on the list
+        if(NOT EX_TARGETS MATCHES "gfx11" AND NOT EX_TARGETS MATCHES "gfx12" AND source_name MATCHES "_wmma")
            message(DEBUG "removing wmma example ${source} ")
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
-    endforeach()
-    #Do not build any microscaling examples if gfx950 target is not on the list
-    foreach(source IN LISTS FILE_NAME)
-	if(NOT EX_TARGETS MATCHES "gfx950" AND source MATCHES "_mx")
+        #Do not build any microscaling examples if gfx950 target is not on the list
+        if(NOT EX_TARGETS MATCHES "gfx950" AND source_name MATCHES "_mx")
            message(DEBUG "removing microscaling example ${source} ")
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
-    endforeach()
-    #Do not build any FP8 examples if CK_ENABLE_FP8 not set
-    foreach(source IN LISTS FILE_NAME)
-        if(NOT DEFINED CK_ENABLE_FP8 AND source MATCHES "_fp8")
+        #Do not build any FP8 examples if CK_ENABLE_FP8 not set
+        if(NOT DEFINED CK_ENABLE_FP8 AND source_name MATCHES "_fp8")
            message(DEBUG "removing fp8 example ${source} ")
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
-    endforeach()
-    #Do not build any BF8 examples if CK_ENABLE_BF8 not set
-    foreach(source IN LISTS FILE_NAME)
-        if(NOT DEFINED CK_ENABLE_BF8 AND source MATCHES "_bf8")
+        #Do not build any BF8 examples if CK_ENABLE_BF8 not set
+        if(NOT DEFINED CK_ENABLE_BF8 AND source_name MATCHES "_bf8")
            message(DEBUG "removing bf8 example ${source} ")
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
-    endforeach()
-    # Build fp8 gemm_multiply_multiply and moe only on gfx94/95
-    foreach(source IN LISTS FILE_NAME)
-    if(NOT EX_TARGETS MATCHES "gfx94" AND NOT EX_TARGETS MATCHES "gfx95")
-        if (source MATCHES "fp8" AND source MATCHES "(gemm_multiply_multiply|moe)")
-            message(DEBUG "Skipping ${source} example for current target")
-            list(REMOVE_ITEM FILE_NAME "${source}")
+        # Build fp8 gemm_multiply_multiply and moe only on gfx94/95
+        if(NOT EX_TARGETS MATCHES "gfx94" AND NOT EX_TARGETS MATCHES "gfx95")
+            if(source_name MATCHES "fp8" AND source_name MATCHES "(gemm_multiply_multiply|moe)")
+                message(DEBUG "Skipping ${source} example for current target")
+                list(REMOVE_ITEM FILE_NAME "${source}")
+            endif()
        endif()
-    endif()
    endforeach()
    #only continue if there are some source files left on the list
+    set(source_name_list "")
+    foreach(source IN LISTS FILE_NAME)
+        get_filename_component(source_name ${source} NAME)
+        list(APPEND source_name_list ${source_name})
+    endforeach()
    if(FILE_NAME)
-        if(FILE_NAME MATCHES "_xdl" AND NOT FILE_NAME MATCHES "_pk_i4")
+        if(source_name_list MATCHES "_xdl" AND NOT source_name_list MATCHES "_pk_i4")
            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
-        elseif(FILE_NAME MATCHES "_wmma")
+        elseif(source_name_list MATCHES "_wmma")
            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950)
-        elseif(FILE_NAME MATCHES "_mx") #only build mx example for gfx950
+        elseif(source_name_list MATCHES "_mx") #only build mx example for gfx950
            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
-        elseif(FILE_NAME MATCHES "_pk_i4") #only build these examples for gfx942 and gfx950
+        elseif(source_name_list MATCHES "_pk_i4") #only build these examples for gfx942 and gfx950
            message(DEBUG "trimming targets for ${FILE_NAME}")
            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
        endif()
@@ -130,7 +123,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
        target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
        target_link_libraries(${EXAMPLE_NAME} PRIVATE getopt::getopt)
        add_test(NAME ${EXAMPLE_NAME} COMMAND $<TARGET_FILE:${EXAMPLE_NAME}> ${ARGN})
-        set_property(TARGET ${EXAMPLE_NAME} PROPERTY HIP_ARCHITECTURES ${EX_TARGETS} )
+        set_property(TARGET ${EXAMPLE_NAME} PROPERTY HIP_ARCHITECTURES ${EX_TARGETS})
        add_dependencies(examples ${EXAMPLE_NAME})
        add_dependencies(check ${EXAMPLE_NAME})
        rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)
@@ -157,71 +150,71 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
    message(DEBUG "adding example ${EXAMPLE_NAME}")
    set(result 1)
    if(DEFINED DTYPES)
-    foreach(source IN LISTS FILE_NAME)
-        set(test 0)
-        if((source MATCHES "_fp16" OR source MATCHES "_f16") AND NOT "fp16" IN_LIST DTYPES)
-            set(test 1)
-        endif()
-        if((source MATCHES "_fp32" OR source MATCHES "_f32") AND NOT "fp32" IN_LIST DTYPES)
-            set(test 1)
-        endif()
-        if((source MATCHES "_fp64" OR source MATCHES "_f64") AND NOT "fp64" IN_LIST DTYPES)
-            set(test 1)
-        endif()
-        if((source MATCHES "_fp8" OR source MATCHES "_f8") AND NOT "fp8" IN_LIST DTYPES)
-            set(test 1)
-        endif()
-        if((source MATCHES "_bf8" OR source MATCHES "_bf8") AND NOT "bf8" IN_LIST DTYPES)
-            set(test 1)
-        endif()
-        if((source MATCHES "_bf16" OR source MATCHES "_b16") AND NOT "bf16" IN_LIST DTYPES)
-            set(test 1)
-        endif()
-        if((source MATCHES "_int8" OR source MATCHES "_i8") AND NOT "int8" IN_LIST DTYPES)
-            set(test 1)
-        endif()
-        if(test EQUAL 1)
-            message(DEBUG "removing example ${source} ")
-            list(REMOVE_ITEM FILE_NAME "${source}")
-        endif()
-    endforeach()
+        foreach(source IN LISTS FILE_NAME)
+            get_filename_component(source_name ${source} NAME)
+            set(test 0)
+            if((source_name MATCHES "_fp16" OR source_name MATCHES "_f16") AND NOT "fp16" IN_LIST DTYPES)
+                set(test 1)
+            endif()
+            if((source_name MATCHES "_fp32" OR source_name MATCHES "_f32") AND NOT "fp32" IN_LIST DTYPES)
+                set(test 1)
+            endif()
+            if((source_name MATCHES "_fp64" OR source_name MATCHES "_f64") AND NOT "fp64" IN_LIST DTYPES)
+                set(test 1)
+            endif()
+            if((source_name MATCHES "_fp8" OR source_name MATCHES "_f8") AND NOT "fp8" IN_LIST DTYPES)
+                set(test 1)
+            endif()
+            if((source_name MATCHES "_bf8" OR source_name MATCHES "_bf8") AND NOT "bf8" IN_LIST DTYPES)
+                set(test 1)
+            endif()
+            if((source_name MATCHES "_bf16" OR source_name MATCHES "_b16") AND NOT "bf16" IN_LIST DTYPES)
+                set(test 1)
+            endif()
+            if((source_name MATCHES "_int8" OR source_name MATCHES "_i8") AND NOT "int8" IN_LIST DTYPES)
+                set(test 1)
+            endif()
+            if(test EQUAL 1)
+                message(DEBUG "removing example ${source} ")
+                list(REMOVE_ITEM FILE_NAME "${source}")
+            endif()
+        endforeach()
    endif()

    set(EX_TARGETS ${SUPPORTED_GPU_TARGETS})

-    #Do not build any DL examples if DL_KERNELS not set
+    set(source_name_list "")
    foreach(source IN LISTS FILE_NAME)
-        if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
+        get_filename_component(source_name ${source} NAME)
+        #Do not build any DL examples if DL_KERNELS not set
+        if(NOT DEFINED DL_KERNELS AND source_name MATCHES "_dl")
            message(DEBUG "removing dl example ${source} ")
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
-    endforeach()
-    #Do not build any XDL examples if gfx9 targets are not on the list
-    foreach(source IN LISTS FILE_NAME)
-        if(NOT EX_TARGETS MATCHES "gfx9" AND source MATCHES "_xdl")
+        #Do not build any XDL examples if gfx9 targets are not on the list
+        if(NOT EX_TARGETS MATCHES "gfx9" AND source_name MATCHES "_xdl")
            message(DEBUG "removing xdl example ${source} ")
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
-    endforeach()
-    #Do not build any WMMA examples if gfx11 targets are not on the list
-    foreach(source IN LISTS FILE_NAME)
-	if(NOT EX_TARGETS MATCHES "gfx11" AND NOT EX_TARGETS MATCHES "gfx12" AND source MATCHES "_wmma")
+        #Do not build any WMMA examples if gfx11 targets are not on the list
+        if(NOT EX_TARGETS MATCHES "gfx11" AND NOT EX_TARGETS MATCHES "gfx12" AND source_name MATCHES "_wmma")
            message(DEBUG "removing wmma example ${source} ")
            list(REMOVE_ITEM FILE_NAME "${source}")
        endif()
+        list(APPEND source_name_list ${source_name})
    endforeach()
    #only continue if there are some source files left on the list
    if(FILE_NAME)
-        if(FILE_NAME MATCHES "_xdl")
+        if(source_name_list MATCHES "_xdl")
            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10-3-generic gfx11-generic gfx12-generic)
-        elseif(FILE_NAME MATCHES "_wmma")
+        elseif(source_name_list MATCHES "_wmma")
            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950)
        endif()
        set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
        add_executable(${EXAMPLE_NAME} ${FILE_NAME})
        target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
        add_dependencies(examples ${EXAMPLE_NAME})
-        set_property(TARGET ${EXAMPLE_NAME} PROPERTY HIP_ARCHITECTURES ${EX_TARGETS} )
+        set_property(TARGET ${EXAMPLE_NAME} PROPERTY HIP_ARCHITECTURES ${EX_TARGETS})
        rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)
        set(result 0)
    endif()
--- a/example/ck_tile/01_fmha/CMakeLists.txt
+++ b/example/ck_tile/01_fmha/CMakeLists.txt
@@ -28,12 +28,14 @@ string(REPLACE ";" "," FMHA_FWD_APIS "${FMHA_FWD_ENABLE_APIS}")
 set(FMHA_FWD_CODE_GEN_COMMON_ARGS
  ${CMAKE_CURRENT_LIST_DIR}/generate.py
  --api ${FMHA_FWD_APIS}
-  # --filter fmha_fwd_d64_fp16_batch_b16x32x64x64x32x64_r1x1x1_r1x1x1_w16x16x32_w16x16x32_qr_async_trload_vr_npad_nlogits_nbias_nmask_nlse_ndropout_nskip_nsquant_trload
-  )
+  --optdim 32,64,128,256
+  # --filter fmha_fwd...
+)
 set(FMHA_BWD_CODE_GEN_COMMON_ARGS
  ${CMAKE_CURRENT_LIST_DIR}/generate.py
  --api bwd
  --receipt 3
+  --optdim 32,64,128,256
  # --filter fmha_bwd_dot...@fmha_bwd_convert...@fmha_bwd...
 )

--- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 # generate kernel instances to speed up compilation

 import copy
@@ -7,22 +7,14 @@ from dataclasses import dataclass
 import fnmatch
 import itertools
 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Dict, Literal
+from collections import defaultdict

 from codegen.cmake_config import *
 from codegen.cpp_symbol_map import *
+from codegen.utils import update_file


-BWD_DQDKDV_PIPELINE_MAP = {
-    "kr_ktr_vr_iglp" : "ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP",
-    "kr_ktr_vr"      : "ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR",
-}
-
-BWD_DQDKDV_PIPELINE_ENUM_MAP = {
-    "kr_ktr_vr_iglp" : "ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP",
-    "kr_ktr_vr"      : "ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR",
-}
-
 FMHA_BWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
 // auto generated by generate.py
@@ -56,8 +48,8 @@ using fmha_bwd_shape_{F_idx} = ck_tile::TileFmhaBwdShape<fmha_block_tile_{F_idx}
                                                         fmha_block_warps2_{F_idx},
                                                         fmha_warp_tile0_{F_idx}>;

-using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad},
-                                                       {F_skpad},
+using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaTraits<false,  /* kPadSeqLenQ */
+                                                       false,  /* kPadSeqLenK */
                                                       {F_dpad},
                                                       {F_dvpad},
                                                       false,
@@ -93,18 +85,18 @@ using fmha_bwd_pipeline_problem_{F_idx} = ck_tile::BlockFmhaBwdPipelineProblem<
    fmha_dropout_{F_idx},
    fmha_bwd_trait_{F_idx}>;

-using fmha_bwd_pipeline_{F_idx} = {F_pipeline}<fmha_bwd_pipeline_problem_{F_idx}>;
+using fmha_bwd_pipeline_{F_idx} = ck_tile::BlockFmhaBwdDQDKDVPipeline<fmha_bwd_pipeline_problem_{F_idx}>;

 using fmha_bwd_dk_epilogue_{F_idx} = ck_tile::Default2DEpilogue<
    ck_tile::Default2DEpilogueProblem<typename FmhaBwdTypeConfig<{F_dtype}>::AccDataType,
                                      typename FmhaBwdTypeConfig<{F_dtype}>::KGradDataType,
-                                      {F_skpad},
+                                      false,
                                      {F_dpad}>>;

 using fmha_bwd_dv_epilogue_{F_idx} = ck_tile::Default2DEpilogue<
    ck_tile::Default2DEpilogueProblem<typename FmhaBwdTypeConfig<{F_dtype}>::AccDataType,
                                      typename FmhaBwdTypeConfig<{F_dtype}>::VGradDataType,
-                                      {F_skpad},
+                                      false,
                                      {F_dvpad}>>;

 using fmha_bwd_dq_dk_dv_kernel_{F_idx} =
@@ -115,13 +107,10 @@ using fmha_bwd_dq_dk_dv_kernel_{F_idx} =
 using dq_dk_dv_trait_{F_idx} = fmha_bwd_dq_dk_dv_traits_<{F_hdim},
                                                         {F_dtype},
                                                         {F_mode},
-                                                         {F_pipeline_enum},
                                                         fmha_mask_{F_idx},
                                                         fmha_dropout_{F_idx},
                                                         {F_bias},
                                                         {F_dbias},
-                                                         {F_spad},
-                                                         {F_skpad},
                                                         {F_dpad},
                                                         {F_dvpad},
                                                         {F_deterministic}>;
@@ -195,108 +184,17 @@ FMHA_BWD_API_PER_HDIM_CASE="""        {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <
 """

 FMHA_BWD_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_dbias == {F_dbias}) && ({F_dropout_check}) &&
-                        ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.is_deterministic == {F_deterministic})) {{
-                using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1}, {F_dvpad}>;
-                using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_pipeline_enum}, {F_mask}, {F_dropout}, {F_bias}, {F_dbias}, {F_spad0}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_deterministic}>;
-                using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1}, {F_dpad}, {F_deterministic}>;
+                        ({F_scheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.is_deterministic == {F_deterministic})) {{
+                using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dvpad}>;
+                using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_mask}, {F_dropout}, {F_bias}, {F_dbias}, {F_dpad}, {F_dvpad}, {F_deterministic}>;
+                using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dpad}, {F_deterministic}>;
                r = fmha_bwd_<dot_do_o_trait_, dq_dk_dv_trait_, convert_dq_trait_>(s, a);
                return r;
            }}
 """

-@dataclass
-class FmhaBwdDQDKDVApiTrait:
-    pipeline      : str
-    # sync with fmha_bwd_traits<>, to generate fallback calls
-    hdim          : str
-    dtype         : str  # data type
-    mode          : str  # value from MODE_MAP
-    bm0           : int  # tile size along q seqlen (block size)
-    bn0           : int  # tile size along k seqlen
-    bhdq          : int  # q head_dim
-    bhdv          : int  # v head_dim
-    mask          : str
-    bias          : str
-    dbias         : str
-    dropout       : str
-    spad          : str
-    skpad         : str
-    dpad          : str
-    dvpad         : str
-    deterministic : str
-
-    def scheck(self, spad1 : str) -> str:
-        if self.mode == 'group':
-            return 'true' # always support
-        elif self.spad == 't' and spad1 == 't':
-            return f'a.seqlen_q % {self.bm0} != 0'
-        elif self.spad == 'f' and spad1 == 't':
-            return f'a.seqlen_q % {self.bm0} == 0 and a.seqlen_q % 64 != 0'
-        else: # self.skpad == 'f' and skpad1 == 'f'
-            return f'a.seqlen_q % 64 == 0'
-
-    @property
-    def skcheck(self) -> str:
-        if self.mode == 'group':
-            return 'true' # always support
-        elif self.skpad == 't':
-            return f'a.seqlen_k % {self.bn0} != 0'
-        else:
-            return f'a.seqlen_k % {self.bn0} == 0'
-
-    @property
-    def dcheck(self) -> str:
-        if self.dpad == 't': return f'a.hdim_q % {self.bhdq} != 0'
-        else :               return f'a.hdim_q % {self.bhdq} == 0'
-
-    @property
-    def dvcheck(self) -> str:
-        if self.dvpad == 't': return f'a.hdim_v % {self.bhdv} != 0'
-        else :                return f'a.hdim_v % {self.bhdv} == 0'
-
-class FmhaBwdApiPool:
-    def __init__(self, mask_impl):
-        self.dq_dk_dv_pool = dict()
-        self.mask_impl = mask_impl
-
-    def register_dq_dk_dv_traits(self, trait : FmhaBwdDQDKDVApiTrait) -> None:
-        # TODO: do we need to check duplication?
-        if trait.dtype not in self.dq_dk_dv_pool.keys():
-            self.dq_dk_dv_pool[trait.dtype] = dict()
-        if trait.hdim not in self.dq_dk_dv_pool[trait.dtype].keys():
-            self.dq_dk_dv_pool[trait.dtype][trait.hdim] = list()
-
-        self.dq_dk_dv_pool[trait.dtype][trait.hdim].append(copy.copy(trait))
-
-    @property
-    def api(self) -> str:
-        per_dtypes=str()
-        for i, dtype in enumerate(self.dq_dk_dv_pool.keys()):
-            per_hdim_case=str()
-            for j, hdim in enumerate(self.dq_dk_dv_pool[dtype].keys()):
-                traits=self.dq_dk_dv_pool[dtype][hdim]
-                hdim_int = int(hdim)
-                inners=str()
-                for k, trait in enumerate(traits):
-                    if_k = 'if' if k == 0 else 'else if'
-                    for spad1 in ["t", "f"]:
-                        if (spad1 == "f" and (trait.spad == "t" or trait.mode == "group")):
-                            continue
-                        inners = inners + FMHA_BWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_pipeline_enum=BWD_DQDKDV_PIPELINE_ENUM_MAP[trait.pipeline],
-                                    F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias],
-                                    F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout],
-                                    F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=BWD_DTYPE_MAP[dtype],
-                                    F_spad0=BOOL_MAP[trait.spad], F_spad1=BOOL_MAP[spad1], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
-                                    F_deterministic=BOOL_MAP[trait.deterministic])
-
-                if_j = 'if' if j == 0 else 'else if'
-                per_hdim_case = per_hdim_case + FMHA_BWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
-            if_i = 'if' if i == 0 else 'else if'
-            per_dtypes = per_dtypes + FMHA_BWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
-        if not per_dtypes:
-            # empty string we add some ignore to suppress warning in api
-            per_dtypes += '    (void)t ; (void)s ; (void)a;'
-        return FMHA_BWD_KERNEL_HEADER + FMHA_BWD_API.format(F_dispatch = per_dtypes)
+# M0 size for 1d kernels (dot/convert)
+M0_1D = 64

 # GEMM0: Q@K=S^T
 # GEMM1: P^T@dO^T=dV(This was chosen as G1 to match fwd, but N1 must be equal to headdim_v)
@@ -304,7 +202,7 @@ class FmhaBwdApiPool:
 # GEMM3: dS^T@Q^T=dK(Similar to G1, but N3 must be equal to headdim_qk)
 # GEMM4: dS@K^T=dQ(N4 must be equal to headdim_qk)
 # Is it necessary to distinguish between K0~K4?
-@dataclass
+@dataclass(frozen=True)
 class FmhaBwdDQDKDVTileSize:
    F_bm0       : int  # tile size along q seqlen (block size)
    F_bn0       : int  # tile size along k seqlen
@@ -337,14 +235,12 @@ class FmhaBwdDQDKDVTileSize:
        f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}_r{self.F_rm2}x{self.F_rn2}x{self.F_rk2}" +\
        f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}_o{self.F_occupancy}"

-@dataclass
+@dataclass(frozen=True)
 class FmhaBwdDQDKDVKernel:
    F_idx           : int  # this is not a tunable, but a counter to differentiate symbol
    F_hdim          : int  # hdim
    F_dtype         : str  # data type
    F_tile          : FmhaBwdDQDKDVTileSize
-    F_spad          : str  # true/false
-    F_skpad         : str  #
    F_dpad          : str  #
    F_dvpad         : str  #
    F_bias          : str  #
@@ -353,7 +249,6 @@ class FmhaBwdDQDKDVKernel:
    F_mask          : str  # value from MASK_MAP
    F_mode          : str  # value from MODE_MAP
    F_deterministic : str  #
-    F_pipeline      : str  #
    mask_impl       : str  #

    @property
@@ -387,8 +282,6 @@ class FmhaBwdDQDKDVKernel:
                F_wm1           = self.F_tile.F_wm1,
                F_wn1           = self.F_tile.F_wn1,
                F_wk1           = self.F_tile.F_wk1,
-                F_spad          = BOOL_MAP[self.F_spad],
-                F_skpad         = BOOL_MAP[self.F_skpad],
                F_dpad          = BOOL_MAP[self.F_dpad],
                F_dvpad         = BOOL_MAP[self.F_dvpad],
                F_bias          = BIAS_MAP[self.F_bias],
@@ -398,21 +291,18 @@ class FmhaBwdDQDKDVKernel:
                F_mask          = get_mask_map(self.mask_impl)[self.F_mask],
                F_mode          = MODE_MAP[self.F_mode],
                F_deterministic = BOOL_MAP[self.F_deterministic],
-                F_pipeline_enum = BWD_DQDKDV_PIPELINE_ENUM_MAP[self.F_pipeline],
-                F_pipeline      = BWD_DQDKDV_PIPELINE_MAP[self.F_pipeline])
+            )

    @property
    def name(self) -> str:
        def pad_name() -> str:
            n = ''
-            if self.F_spad == 't': n += 's'
-            if self.F_skpad == 't' : n += 'sk'
            if self.F_dpad == 't' : n += 'd'
            if self.F_dvpad == 't' : n += 'dv'
            if n != '' : n = 'p' + n
            return n
        pn = pad_name()
-        n = f"fmha_bwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + self.F_tile.name + f'_{self.F_pipeline}'
+        n = f"fmha_bwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + self.F_tile.name
        if pn != '' : n += f'_{pn}'
        else: n += '_npad'

@@ -440,121 +330,20 @@ class FmhaBwdDQDKDVKernel:
    def filename(self) -> str:
        return self.name + ".cpp"

-    def api_trait(self) -> FmhaBwdDQDKDVApiTrait:
-        return FmhaBwdDQDKDVApiTrait(pipeline=self.F_pipeline,
-                hdim=str(self.F_hdim),
-                dtype=self.F_dtype,
-                mode=self.F_mode,
-                bm0=self.F_tile.F_bm0,
-                bn0=self.F_tile.F_bn0,
-                bhdq=self.F_tile.F_bhdq,
-                bhdv=self.F_tile.F_bhdv,
-                mask=self.F_mask,
-                bias=self.F_bias,
-                dbias=self.F_dbias,
-                dropout=self.F_dropout,
-                spad=self.F_spad,
-                skpad=self.F_skpad,
-                dpad=self.F_dpad,
-                dvpad=self.F_dvpad,
-                deterministic=self.F_deterministic
-                )
-
 # TODO: design a more practical way to do it
-# this is current supported tile size & pipeline.
+# this is current supported tile size.
 def get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype : str) -> Optional[dict]:
    if dtype == 'fp16' or dtype == 'bf16':
        return {
-            '32'  : [FmhaBwdDQDKDVTileSize( 32, 128,  32, 32,  32, 32, 64,  32,  32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
-                        "kr_ktr_vr_iglp", "kr_ktr_vr"],
-            '64'  : [FmhaBwdDQDKDVTileSize( 32, 128,  64, 32,  64, 32, 32,  64,  64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
-                        "kr_ktr_vr_iglp", "kr_ktr_vr"],
-            '128' : [FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
-                        "kr_ktr_vr_iglp", "kr_ktr_vr"],
-            '256' : [FmhaBwdDQDKDVTileSize( 16,  64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
-                        "kr_ktr_vr_iglp", "kr_ktr_vr"]
+            '32'  : FmhaBwdDQDKDVTileSize( 32, 128,  32, 32,  32, 32, 64,  32,  32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
+            '64'  : FmhaBwdDQDKDVTileSize( 32, 128,  64, 32,  64, 32, 32,  64,  64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
+            '128' : FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
+            # '160' : FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
+            '256' : FmhaBwdDQDKDVTileSize( 16,  64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
        }
    else:
        return None

-def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[FmhaBwdApiPool, List[FmhaBwdDQDKDVKernel]]:
-    # TODO: we don't support tuning yet, so pick up one value for pad
-    #       support this in future
-    gen = list()
-    api_pool = FmhaBwdApiPool(mask_impl)
-
-    for dtype in BWD_DTYPE_MAP.keys():
-        d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype)
-        if d == None:
-            continue
-        for hdim_str, mode, mask, bias, dbias, dropout, spad, skpad, dpad, dvpad, deterministic in itertools.product(d.keys(), MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], DROPOUT_MAP.keys(), ["t", "f"], ["t", "f"], ["t", "f"], ["t", "f"], ["t", "f"]):
-            tile = d[hdim_str][0]
-            ppl = d[hdim_str][1]
-            hdim = int(hdim_str)
-            if (mode == "group") and (spad == "f" or skpad == "f"):
-                continue
-            if ((bias == "no" or bias == "alibi") and dbias == "t"):
-                continue
-            if ("wg32" in dropout):
-                continue
-            if (dpad == "t" or dvpad == "t"):
-                ppl = d[hdim_str][2]
-            k = FmhaBwdDQDKDVKernel(F_idx=0, F_hdim=hdim, F_dtype=dtype, F_tile=tile,
-                                F_spad=spad, F_skpad=skpad, F_dpad=dpad, F_dvpad=dvpad,
-                                F_bias=bias, F_dbias=dbias, F_dropout=dropout, F_mask=mask, F_mode=mode,
-                                F_pipeline=ppl, mask_impl=mask_impl, F_deterministic=deterministic)
-            if kernel_filter != '':
-                if not fnmatch.fnmatch(k.name, kernel_filter):
-                    continue
-            # Flash attention integration
-            if receipt == 2:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= bias in ['no', 'alibi']
-                    cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
-                    cond &= dpad == dvpad
-                    if not cond:
-                        continue
-            elif receipt == 3:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= bias in ['no', 'alibi']
-                    cond &= dpad == dvpad
-                    cond &= deterministic == "f"
-                    if not cond:
-                        continue
-            # PyTorch integration
-            elif receipt == 4:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= bias in ['no', 'bias']
-                    cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
-                    cond &= dpad == dvpad
-                    cond &= mode == 'batch'
-                    cond &= deterministic == "f"
-                    if not cond:
-                        continue
-            # Aiter (mha_bwd) integration
-            elif receipt == 300:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "batch"
-                    cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
-                    if not cond:
-                        continue
-            # Aiter (mha_varlen_bwd) integration
-            elif receipt == 400:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "group"
-                    cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
-                    if not cond:
-                        continue
-            # aiter::mha_bwd C++ api integration
-            elif receipt == 600:
-                    cond = dtype in ['fp16', 'bf16']
-                    if not cond:
-                        continue
-            api_pool.register_dq_dk_dv_traits(k.api_trait())
-            gen.append(k)
-
-    return (api_pool, gen)
-
 FMHA_BWD_DOT_DO_O_KERNEL_BODY="""
 using fmha_dtype_{F_idx} = {F_dtype};

@@ -565,7 +354,7 @@ using fmha_bwd_dot_do_o_pipeline_problem_{F_idx} = ck_tile::BlockFmhaBwdOGradDot
    typename FmhaBwdTypeConfig<fmha_dtype_{F_idx}>::ODataType,
    typename FmhaBwdTypeConfig<fmha_dtype_{F_idx}>::OGradDataType,
    typename FmhaBwdTypeConfig<fmha_dtype_{F_idx}>::DDataType,
-    /* BlockSize = */ 64,
+    /* BlockSize = M0 = */ 64,
    {F_hdim},
    {F_mode},
    fmha_bwd_dot_do_o_trait_{F_idx}>;
@@ -613,7 +402,7 @@ std::string fmha_bwd_dot_do_o_get_name_<dot_do_o_trait_{F_idx}>()
 }}
 """

-@dataclass
+@dataclass(frozen=True)
 class FmhaBwdOGradDotOKernel:
    F_idx       : int  # this is not a tunable, but a counter to differentiate symbol
    F_hdim      : int  # hdim
@@ -653,49 +442,6 @@ class FmhaBwdOGradDotOKernel:
    def filename(self) -> str:
        return self.name + ".cpp"

-def get_bwd_dot_do_o_blobs(kernel_filter : Optional[str], receipt) -> List[FmhaBwdOGradDotOKernel]:
-    # TODO: we don't support tuning yet, so pick up one value for pad/occupancy
-    #       support this in future
-    def get_occupancy(dtype, hdim):
-        return 2
-
-    gen = list()
-
-    for dtype in BWD_DTYPE_MAP.keys():
-        d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype)
-        if d == None:
-            continue
-        for hdim_str, mode, spad, dvpad in itertools.product(d.keys(), MODE_MAP.keys(), ["t", "f"], ["t", "f"]):
-            hdim = int(hdim_str)
-            if (mode == "group" and spad == "f"):
-                continue
-            k = FmhaBwdOGradDotOKernel(F_idx=0, F_hdim=hdim, F_dtype=dtype,
-                                F_spad=spad, F_dvpad=dvpad, F_mode=mode,
-                                F_occupancy=get_occupancy(dtype, hdim))
-            if kernel_filter != '':
-                if not fnmatch.fnmatch(k.name, kernel_filter):
-                    continue
-            # Aiter (mha_bwd) integration
-            if receipt == 300:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "batch"
-                    if not cond:
-                        continue
-            # Aiter (mha_varlen_bwd) integration
-            elif receipt == 400:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "group"
-                    if not cond:
-                        continue
-            # aiter::mha_bwd C++ api integration
-            elif receipt == 600:
-                    cond = dtype in ['fp16', 'bf16']
-                    if not cond:
-                        continue
-            gen.append(k)
-
-    return gen
-
 FMHA_BWD_CONVERT_DQ_KERNEL_BODY="""
 using fmha_dtype_{F_idx} = {F_dtype};

@@ -762,7 +508,7 @@ std::string fmha_bwd_convert_dq_get_name_<convert_dq_trait_{F_idx}>()
 }}
 """

-@dataclass
+@dataclass(frozen=True)
 class FmhaBwdConvertQGradKernel:
    F_idx           : int  # this is not a tunable, but a counter to differentiate symbol
    F_hdim          : int  # hdim
@@ -810,92 +556,235 @@ class FmhaBwdConvertQGradKernel:
    def filename(self) -> str:
        return self.name + ".cpp"

-def get_bwd_convert_dq_blobs(kernel_filter : Optional[str], receipt) -> List[FmhaBwdConvertQGradKernel]:
-    # TODO: we don't support tuning yet, so pick up one value for pad/occupancy
-    #       support this in future
-    def get_occupancy(dtype, hdim):
-        return 2
+@dataclass(frozen=True)
+class FmhaBwdApiTrait:
+    idx           : int  # this is not a tunable, but a counter to differentiate symbol
+    # sync with fmha_bwd_traits<>, to generate fallback calls
+    hdim          : int
+    dtype         : str  # data type
+    mode          : str  # value from MODE_MAP
+    tile          : FmhaBwdDQDKDVTileSize
+    mask          : str
+    bias          : str
+    dbias         : str
+    dropout       : str
+    spad1d        : str # spad for 1d kernels (dot/convert)
+    dpad          : str
+    dvpad         : str
+    deterministic : str
+    mask_impl     : str

-    gen = list()
+    @property
+    def bm0(self) -> int:
+        return self.tile.F_bm0
+    @property
+    def bn0(self) -> int:
+        return self.tile.F_bn0
+    @property
+    def bhdq(self) -> int:
+        return self.tile.F_bhdq
+    @property
+    def bhdv(self) -> int:
+        return self.tile.F_bhdv
+
+    @property
+    def scheck(self) -> str:
+        if self.mode == 'group':
+            return 'true' # always support
+        elif self.spad1d == 't':
+            return f'a.seqlen_q % {M0_1D} != 0'
+        else: # self.spad1d == 'f'
+            return f'a.seqlen_q % {M0_1D} == 0'
+
+    @property
+    def dcheck(self) -> str:
+        if self.dpad == 't': return f'a.hdim_q % {self.bhdq} != 0'
+        else :               return f'a.hdim_q % {self.bhdq} == 0'
+
+    @property
+    def dvcheck(self) -> str:
+        if self.dvpad == 't': return f'a.hdim_v % {self.bhdv} != 0'
+        else :                return f'a.hdim_v % {self.bhdv} == 0'
+
+    @property
+    def dot_do_o_kernel(self) -> FmhaBwdOGradDotOKernel:
+        # TODO: we don't support tuning yet, so pick up one value for pad/occupancy
+        #       support this in future
+        def get_occupancy(dtype, hdim):
+            return 2
+
+        return FmhaBwdOGradDotOKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_spad=self.spad1d,
+            F_dvpad=self.dvpad, F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim))
+
+    @property
+    def dq_dk_dv_kernel(self) -> FmhaBwdDQDKDVKernel:
+        return FmhaBwdDQDKDVKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_tile=self.tile,
+            F_dpad=self.dpad, F_dvpad=self.dvpad, F_bias=self.bias, F_dbias=self.dbias, F_dropout=self.dropout,
+            F_mask=self.mask, F_mode=self.mode, F_deterministic=self.deterministic, mask_impl=self.mask_impl)
+
+    @property
+    def convert_dq_kernel(self) -> FmhaBwdConvertQGradKernel:
+        # TODO: we don't support tuning yet, so pick up one value for pad/occupancy
+        #       support this in future
+        def get_occupancy(dtype, hdim):
+            return 2
+
+        return FmhaBwdConvertQGradKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype,
+            F_bm0=M0_1D, F_bn0=self.tile.F_bn0, F_spad=self.spad1d, F_dpad=self.dpad,
+            F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim),
+            F_deterministic=self.deterministic)
+
+class FmhaBwdApiPool:
+    def __init__(self, mask_impl):
+        self.dq_dk_dv_pool = defaultdict(lambda: defaultdict(list))
+        self.mask_impl = mask_impl
+
+    def register_dq_dk_dv_traits(self, trait : FmhaBwdApiTrait) -> None:
+        # TODO: do we need to check duplication?
+        self.dq_dk_dv_pool[trait.dtype][trait.hdim].append(copy.copy(trait))
+
+    @staticmethod
+    def if_(i: int) -> str:
+        return 'if' if i == 0 else 'else if'
+
+    def _api_innders(self, traits: List[FmhaBwdApiTrait]) -> str:
+        inners = ""
+        i = 0 
+        for trait in traits:
+            inners += FMHA_BWD_API_INNER_DISPATCH.format(F_if=self.if_(i), F_mode=MODE_MAP[trait.mode],
+                F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias],
+                F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout],
+                F_scheck=trait.scheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=trait.hdim, F_dtype=BWD_DTYPE_MAP[trait.dtype],
+                F_spad1d=BOOL_MAP[trait.spad1d], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
+                F_deterministic=BOOL_MAP[trait.deterministic])
+            i += 1
+        return inners
+
+    @property
+    def api(self) -> str:
+        per_dtypes=str()
+        for i, dtype in enumerate(self.dq_dk_dv_pool):
+            per_hdim_case=str()
+            for j, hdim in enumerate(self.dq_dk_dv_pool[dtype]):
+                traits=self.dq_dk_dv_pool[dtype][hdim]
+                inners = self._api_innders(traits)
+                per_hdim_case = per_hdim_case + FMHA_BWD_API_PER_HDIM_CASE.format(F_if=self.if_(j), F_hdim=hdim, F_inner_dispatch=inners)
+            per_dtypes += FMHA_BWD_API_PER_DTYPE.format(F_if=self.if_(i), F_dtype=dtype, F_hdim_case=per_hdim_case)
+        if not per_dtypes:
+            # empty string we add some ignore to suppress warning in api
+            per_dtypes += '    (void)t ; (void)s ; (void)a;'
+        return FMHA_BWD_KERNEL_HEADER + FMHA_BWD_API.format(F_dispatch = per_dtypes)
+
+def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[FmhaBwdApiPool, List[FmhaBwdOGradDotOKernel], List[FmhaBwdDQDKDVKernel], List[FmhaBwdConvertQGradKernel]]:
+    if filter_list == '':
+        filter_list = '*@*@*'
+    filter_list = filter_list.split('@')
+    filter_list.extend(['*'] * (3 - len(filter_list)))
+    filter_dot_do_o = filter_list[0]
+    filter_convert_dq = filter_list[1]
+    filter_dq_dk_dv = filter_list[2]
+
+    # use dict as ordered set
+    gen_dot_do_o: Dict[FmhaBwdOGradDotOKernel, Literal[True]] = {}
+    gen_dq_dk_dv: Dict[FmhaBwdDQDKDVKernel, Literal[True]] = {}
+    gen_convert_dq: Dict[FmhaBwdConvertQGradKernel, Literal[True]] = {}
+    api_pool = FmhaBwdApiPool(mask_impl)

    for dtype in BWD_DTYPE_MAP.keys():
        d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype)
-        if d == None:
+        if d is None:
            continue
-        for hdim_str, mode, spad, dpad, deterministic in itertools.product(d.keys(), MODE_MAP.keys(), ["t", "f"], ["t", "f"], ["t", "f"]):
+        for hdim_str, mode, mask, bias, dbias, dropout, spad1d, dpad, dvpad, deterministic in itertools.product(d.keys(), MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], DROPOUT_MAP.keys(), *([["t", "f"]] * 4)):
+            tile = d[hdim_str]
            hdim = int(hdim_str)
-            tile = d[hdim_str][0]
-            if (mode == "group" and spad == "f"):
+            if (mode == "group") and (spad1d == "f"):
                continue
-            k = FmhaBwdConvertQGradKernel(F_idx=0, F_hdim=hdim, F_dtype=dtype, F_bm0=64, F_bn0=tile.F_bn0,
-                                F_spad=spad, F_dpad=dpad, F_mode=mode, F_occupancy=get_occupancy(dtype, hdim), F_deterministic=deterministic)
-            if kernel_filter != '':
-                if not fnmatch.fnmatch(k.name, kernel_filter):
+            if ((bias == "no" or bias == "alibi") and dbias == "t"):
+                continue
+            if ("wg32" in dropout):
+                continue
+            t = FmhaBwdApiTrait(idx=0, hdim=hdim, dtype=dtype, mode=mode,tile=tile,mask=mask, bias=bias, dbias=dbias, dropout=dropout, spad1d=spad1d, dpad=dpad, dvpad=dvpad, deterministic=deterministic, mask_impl=mask_impl)
+
+            if not fnmatch.fnmatch(t.dot_do_o_kernel.name, filter_dot_do_o):
+                continue
+            if not fnmatch.fnmatch(t.dq_dk_dv_kernel.name, filter_dq_dk_dv):
+                continue
+            if not fnmatch.fnmatch(t.convert_dq_kernel.name, filter_convert_dq):
+                continue
+            if optdim_list != [-1]:
+                if hdim not in optdim_list:
+                    continue
+
+            # Flash attention integration
+            if receipt == 2:
+                cond = dtype in ['fp16', 'bf16']
+                cond &= bias in ['no', 'alibi']
+                cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
+                cond &= dpad == dvpad
+                if not cond:
+                    continue
+            elif receipt == 3:
+                cond = dtype in ['fp16', 'bf16']
+                cond &= bias in ['no', 'alibi']
+                cond &= dpad == dvpad
+                cond &= deterministic == "f"
+                if not cond:
+                    continue
+            # PyTorch integration
+            elif receipt == 4:
+                cond = dtype in ['fp16', 'bf16']
+                cond &= bias in ['no', 'bias']
+                cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
+                cond &= dpad == dvpad
+                cond &= deterministic == "f"
+                if not cond:
                    continue
            # Aiter (mha_bwd) integration
-            if receipt == 300:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "batch"
-                    if not cond:
-                        continue
+            elif receipt == 300:
+                cond = dtype in ['fp16', 'bf16']
+                cond &= mode == "batch"
+                cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
+                if not cond:
+                    continue
            # Aiter (mha_varlen_bwd) integration
            elif receipt == 400:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "group"
-                    if not cond:
-                        continue
+                cond = dtype in ['fp16', 'bf16']
+                cond &= mode == "group"
+                cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
+                if not cond:
+                    continue
            # aiter::mha_bwd C++ api integration
            elif receipt == 600:
-                    cond = dtype in ['fp16', 'bf16']
-                    if not cond:
-                        continue
-            gen.append(k)
+                cond = dtype in ['fp16', 'bf16']
+                if not cond:
+                    continue
+            gen_dot_do_o[t.dot_do_o_kernel] = True
+            gen_dq_dk_dv[t.dq_dk_dv_kernel] = True
+            gen_convert_dq[t.convert_dq_kernel] = True
+            api_pool.register_dq_dk_dv_traits(t)

-    return gen
-
-def write_single_bwd_dq_dk_dv_kernel(kernel: FmhaBwdDQDKDVKernel, autogen_dir: Path) -> None:
-    (autogen_dir / kernel.filename).write_text(kernel.template)
-
-def write_single_bwd_dot_do_o_kernel(kernel: FmhaBwdOGradDotOKernel, autogen_dir: Path) -> None:
-    (autogen_dir / kernel.filename).write_text(kernel.template)
-
-def write_single_bwd_convert_dq_kernel(kernel: FmhaBwdConvertQGradKernel, autogen_dir: Path) -> None:
-    (autogen_dir / kernel.filename).write_text(kernel.template)
-
-def write_bwd_api(api_pool : FmhaBwdApiPool, autogen_dir: Path) -> None:
-    (autogen_dir / FMHA_BWD_API_FILENAME).write_text(api_pool.api)
+    return api_pool, list(gen_dot_do_o.keys()), list(gen_dq_dk_dv.keys()), list(gen_convert_dq.keys())

 def write_blobs(output_dir : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None:
-    filter_list = filter_list.split('@')
-    filter_list.extend([''] * (3 - len(filter_list)))
-    # TODO
-    assert optdim_list == [-1]
+    api_pool, kernels_dot_do_o,  kernels_dq_dk_dv,  kernels_convert_dq = get_bwd_blobs(filter_list, receipt, mask_impl, optdim_list)
+    update_file(output_dir / FMHA_BWD_API_FILENAME, api_pool.api)
+    for k in kernels_dot_do_o:
+        update_file(output_dir / k.filename, k.template)
+    for k in kernels_convert_dq:
+        update_file(output_dir / k.filename, k.template)
+    for k in kernels_dq_dk_dv:
+        update_file(output_dir / k.filename, k.template)

-    kernels = get_bwd_dot_do_o_blobs(filter_list[0], receipt)
-    for kernel in kernels:
-        write_single_bwd_dot_do_o_kernel(kernel, output_dir)
-    kernels = get_bwd_convert_dq_blobs(filter_list[1], receipt)
-    for kernel in kernels:
-        write_single_bwd_convert_dq_kernel(kernel, output_dir)
-    api_pool, kernels = get_bwd_dq_dk_dv_blobs(filter_list[2], receipt, mask_impl)
-    for kernel in kernels:
-        write_single_bwd_dq_dk_dv_kernel(kernel, output_dir)
-    write_bwd_api(api_pool, output_dir)

-def list_blobs(file_path : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None:
-    filter_list = filter_list.split('@')
-    filter_list.extend([''] * (3 - len(filter_list)))
-    # TODO
-    assert optdim_list == [-1]
-
-    with file_path.open('a') as f:
-        kernels = get_bwd_dot_do_o_blobs(filter_list[0], receipt)
-        for kernel in kernels:
-            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
-        kernels = get_bwd_convert_dq_blobs(filter_list[1], receipt)
-        for kernel in kernels:
-            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
-        _, kernels = get_bwd_dq_dk_dv_blobs(filter_list[2], receipt, mask_impl)
-        for kernel in kernels:
-            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
+def list_blobs(file_path: Path, filter_list: str, receipt, optdim_list, mask_impl) -> None:
+    _, kernels_dot_do_o, kernels_dq_dk_dv, kernels_convert_dq = get_bwd_blobs(
+        filter_list, receipt, mask_impl, optdim_list
+    )
+    with file_path.open("a") as f:
+        for k in kernels_dot_do_o:
+            f.write(str(file_path.parent / GEN_DIR / k.filename) + "\n")
+        for k in kernels_dq_dk_dv:
+            f.write(str(file_path.parent / GEN_DIR / k.filename) + "\n")
+        for k in kernels_convert_dq:
+            f.write(str(file_path.parent / GEN_DIR / k.filename) + "\n")
        f.write(str(file_path.parent / GEN_DIR / FMHA_BWD_API_FILENAME) + "\n")
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -28,6 +28,7 @@ K0_MAX_SUBMAX_MAP = {
    64 : 64,
    96 : 128,
    128: 128,
+    192: 192,
    256: 256
 }

@@ -539,14 +540,14 @@ class KernelComponentFactory:
                (64, 64)  : [FmhaFwdTileSize(16, 32,  64, 64,  32,  64,   1, 1, 1,  1, 1, 1,  16, 16, 32,  16, 16, 32,  -1),
                             FmhaFwdTileSize(32, 32,  64, 64,  32,  64,   1, 1, 1,  1, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
                             FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
-            ### (96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+                (96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
                (128,128) : [FmhaFwdTileSize(16, 32, 64, 128, 32,  128,  1, 1, 1,  1, 1, 1,  16, 16, 32,  16, 16, 32,  -1),
                             FmhaFwdTileSize(32, 32, 128, 128, 32,  128,  1, 1, 1,  1, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
                             FmhaFwdTileSize(128, 64, 32, 128, 16,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
                             FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
-            ### (160,160) : [FmhaFwdTileSize(128, 128, 32, 160, 32,  160,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,   1)],
+                (160,160) : [FmhaFwdTileSize(128, 128, 32, 160, 32,  160,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,   1)],
                (192,128) : [FmhaFwdTileSize(128, 128, 32, 128, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
-            ### (192,192) : [FmhaFwdTileSize(128, 128, 32, 192, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,   1)],
+                (192,192) : [FmhaFwdTileSize(128, 128, 32, 192, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,   1)],
                (256,256) : [FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
            }
        elif dtype == 'fp8' or dtype == 'bf8':
@@ -570,23 +571,34 @@ class KernelComponentFactory:
        pipelines = []
        if dtype in ['fp16', 'bf16']:
            for logits, mask, bias, lse, dropout, skip, trload in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"], ["t", "f"], ["t", "f"]):
-                if bias == "bias":
-                    # TODO: rocm 6.2 compiler problem if using qr_async for bias case
+                if hdim == 256 and hdim_v == 256:
+                # if True:
                    pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, trload))
-                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
                    pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, trload))
+                    # the below two is used for hdim vectorize load
+                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, trload))
+                    pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, trload))
+
+                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
                    pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
-                elif (hdim, hdim_v) in [(64, 64), (128, 128)] and logits == "f" and bias == "no" and dropout == "f" and lse == "f" and skip == "f" and trload == "t":
-                    pipelines.append(FmhaFwdPipeline('qr_async_trload', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, trload))
-                    pipelines.append(FmhaFwdPipeline('qr_async_trload', 'row', 'f', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
                else:
-                    pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
-                    pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
-                    pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
-                    pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
-                if receipt == 1 and bias != "bias":
-                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload)) # TODO: cover arbitraty hdim
-                    pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload)) # TODO: cover arbitraty hdim
+                    if bias == "bias":
+                        # TODO: rocm 6.2 compiler problem if using qr_async for bias case
+                        pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, trload))
+                        pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
+                        pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, trload))
+                        pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
+                    elif (hdim, hdim_v) in [(64, 64), (128, 128)] and logits == "f" and bias == "no" and dropout == "f" and lse == "f" and skip == "f" and trload == "t":
+                        pipelines.append(FmhaFwdPipeline('qr_async_trload', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, trload))
+                        pipelines.append(FmhaFwdPipeline('qr_async_trload', 'row', 'f', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
+                    else:
+                        pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
+                        pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
+                        pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
+                        pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload))
+                    if receipt == 1 and bias != "bias":
+                        pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload)) # TODO: cover arbitraty hdim
+                        pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip, trload)) # TODO: cover arbitraty hdim
        elif dtype in ['fp8', 'bf8']:
            # no need lse/dropout kernels
            for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
@@ -624,7 +636,7 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl
                    if pipeline.F_spad != 't' or pipeline.F_skpad != 't':
                        # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not
                        continue
-                if (hdim, hdim_v) == (192, 128) or hdim == 160:
+                if (hdim, hdim_v) == (192, 128):
                    # NOTE: this is used to speedup deepseek prefill case, we don't gen training
                    if pipeline.F_bias != 'no' or pipeline.F_dropout == 't':
                        continue
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
@@ -273,7 +273,7 @@ def get_fmha_fwd_appendkv_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
    else:
        return None

-def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[FmhaFwdAppendKVApiPool, List[FmhaFwdAppendKVKernel]]:
+def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl, optdim_list) -> Tuple[FmhaFwdAppendKVApiPool, List[FmhaFwdAppendKVKernel]]:
    # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad
    #       support this in future
    def get_pipelines(dtype, hdim) -> List[FmhaFwdAppendKVPipeline]:
@@ -326,6 +326,9 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                if kernel_filter != '':
                    if not fnmatch.fnmatch(k.name, kernel_filter):
                        continue
+                if optdim_list != [-1]:
+                    if hdim not in optdim_list:
+                        continue
                # 2 - Flash attention integration
                if receipt == 2:
                    cond = dtype in ['fp16', 'bf16']
@@ -334,7 +337,7 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                        continue
                # PyTorch integration
                elif receipt == 4:
-                    cond = dtype in ['fp16, bf16']
+                    cond = dtype in ['fp16', 'bf16']
                    cond &= pipeline.F_vlayout == 'row'
                    if not cond:
                        continue
@@ -350,16 +353,14 @@ def write_fwd_appendkv_api(api_pool : FmhaFwdAppendKVApiPool, autogen_dir: Path)
    (autogen_dir / FMHA_FWD_APPENDKV_API_FILENAME).write_text(api_pool.api)

 def write_blobs(output_dir : Path, kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> None:
-    assert optdim_list == [-1]
-    api_pool, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl)
+    api_pool, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl, optdim_list)
    for kernel in kernels:
        write_single_kernel(kernel, output_dir)
    write_fwd_appendkv_api(api_pool, output_dir)

 def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> None:
-    assert optdim_list == [-1]
    with file_path.open('a') as f:
-        _, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl)
+        _, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl, optdim_list)
        for kernel in kernels:
            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_APPENDKV_API_FILENAME) + "\n")
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -41,7 +41,6 @@ K0_MAX_SUBMAX_MAP = {
 FMHA_FWD_SPLITKV_PIPELINE_MAP = {
    "qr" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS",
    "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS",
-    "qr_async" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync",
 }

 FMHA_FWD_SPLITKV_KERNEL_BODY="""
@@ -637,9 +636,9 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
        return {
            '32'  : FmhaFwdTileSize(32, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
            '64'  : FmhaFwdTileSize(64, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
-        ### '96'  : FmhaFwdTileSize(64, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+            '96'  : FmhaFwdTileSize(64, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
            '128' : FmhaFwdTileSize(64, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
-        ### '160' : FmhaFwdTileSize(64, 128, 32, 160, 32,  160,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+            '160' : FmhaFwdTileSize(64, 128, 32, 160, 32,  160,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
            '256' : FmhaFwdTileSize(64, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
        }
    elif dtype == 'fp8' or dtype == 'bf8':
@@ -656,9 +655,9 @@ def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[d
        return {
            '32'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
            '64'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
-        ### '96'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
+            '96'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
            '128' : FmhaFwdSplitKVCombineTileSize(32,  -1),
-        ### '160' : FmhaFwdSplitKVCombineTileSize(32,  -1),
+            '160' : FmhaFwdSplitKVCombineTileSize(32,  -1),
            '256' : FmhaFwdSplitKVCombineTileSize(32,  -1),
    }
    elif dtype == 'fp8' or dtype == 'bf8':
@@ -670,7 +669,7 @@ def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[d
    else:
        return None

-def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[FmhaFwdSplitKVApiPool, List[FmhaFwdSplitKVKernel]]:
+def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl, optdim_list) -> Tuple[FmhaFwdSplitKVApiPool, List[FmhaFwdSplitKVKernel]]:
    Pipeline = FmhaFwdSplitKVPipeline
    Kernel = FmhaFwdSplitKVKernel

@@ -685,28 +684,17 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
        pipelines = []
        if dtype in ['fp16', 'bf16']:
            for logits, mask, bias, pagedkv in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]):
-                # TODO: use async pipeline when compiler is more stable
-                if hdim == 256 or hdim in [32, 64, 128]:         ### [32, 64, 96, 128, 160]:
-                # if True:
-                    pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))

-                    pipelines.append(Pipeline('qr', 'row', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'row', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))

-                    pipelines.append(Pipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))

-                    pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
-                else:
-                    pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
-                    if receipt == 1:
-                        pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim
-                        pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim
+                pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
        elif dtype in ['fp8', 'bf8']:
            for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
                pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, 't', squant, 'f', mask))
@@ -746,6 +734,9 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                if kernel_filter != '':
                    if not fnmatch.fnmatch(k.name, kernel_filter):
                        continue
+                if optdim_list != [-1]:
+                    if hdim not in optdim_list:
+                        continue
                # Flash attention integration
                if receipt == 2:
                    cond = dtype in ['fp16', 'bf16']
@@ -783,7 +774,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->

    return (api_pool, gen)

-def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt) -> List[FmhaFwdSplitKVCombineKernel]:
+def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt, optdim_list) -> List[FmhaFwdSplitKVCombineKernel]:
    Pipeline = FmhaFwdSplitKVCombinePipeline
    Kernel = FmhaFwdSplitKVCombineKernel

@@ -830,6 +821,9 @@ def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt) -> Lis
                if kernel_filter != '':
                    if not fnmatch.fnmatch(k.name, kernel_filter):
                        continue
+                if optdim_list != [-1]:
+                    if hdim not in optdim_list:
+                        continue
                # Aiter(mha_varlen_fwd) integration
                if receipt == 200:
                    cond = dtype in ['fp16', 'bf16']
@@ -855,12 +849,11 @@ def write_fwd_splitkv_api(api_pool : FmhaFwdSplitKVApiPool, autogen_dir: Path) -
 def write_blobs(output_dir : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None:
    filter_list = filter_list.split('@')
    filter_list.extend([''] * (2 - len(filter_list)))
-    assert optdim_list == [-1]

-    kernels = get_fwd_splitkv_combine_blobs(filter_list[0], receipt)
+    kernels = get_fwd_splitkv_combine_blobs(filter_list[0], receipt, optdim_list)
    for kernel in kernels:
        write_single_kernel(kernel, output_dir)
-    api_pool, kernels = get_fwd_splitkv_blobs(filter_list[1], receipt, mask_impl)
+    api_pool, kernels = get_fwd_splitkv_blobs(filter_list[1], receipt, mask_impl, optdim_list)
    for kernel in kernels:
        write_single_kernel(kernel, output_dir)
    write_fwd_splitkv_api(api_pool, output_dir)
@@ -868,13 +861,12 @@ def write_blobs(output_dir : Path, filter_list : str, receipt, optdim_list, mask
 def list_blobs(file_path : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None:
    filter_list = filter_list.split('@')
    filter_list.extend([''] * (2 - len(filter_list)))
-    assert optdim_list == [-1]

    with file_path.open('a') as f:
-        kernels = get_fwd_splitkv_combine_blobs(filter_list[0], receipt)
+        kernels = get_fwd_splitkv_combine_blobs(filter_list[0], receipt, optdim_list)
        for kernel in kernels:
            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
-        _, kernels = get_fwd_splitkv_blobs(filter_list[1], receipt, mask_impl)
+        _, kernels = get_fwd_splitkv_blobs(filter_list[1], receipt, mask_impl, optdim_list)
        for kernel in kernels:
            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_SPLITKV_API_FILENAME) + "\n")
--- a/example/ck_tile/01_fmha/fmha_bwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -357,31 +357,25 @@ auto fmha_bwd_convert_dq_create_kargs_and_grids(fmha_bwd_args args)
 template <ck_tile::index_t HDim_,
          typename DataType_,
          bool kIsGroupMode_,
-          ck_tile::BlockFmhaBwdPipelineEnum FmhaBwdPipelineEnum_,
          typename FmhaMask_,
          typename FmhaDropout_,
          ck_tile::BlockAttentionBiasEnum BiasEnum_,
          bool kHasBiasGrad_,
-          bool kPadS_,
-          bool kPadSK_,
          bool kPadD_,
          bool kPadDv_,
          bool kIsDeterministic_>
 struct fmha_bwd_dq_dk_dv_traits_
 {
-    static constexpr ck_tile::index_t HDim    = HDim_;
-    using DataType                            = ck_tile::remove_cvref_t<DataType_>;
-    static constexpr bool kIsGroupMode        = kIsGroupMode_;
-    static constexpr auto FmhaBwdPipelineEnum = FmhaBwdPipelineEnum_;
-    using FmhaMask                            = ck_tile::remove_cvref_t<FmhaMask_>;
-    using FmhaDropout                         = ck_tile::remove_cvref_t<FmhaDropout_>;
-    static constexpr auto BiasEnum            = BiasEnum_;
-    static constexpr bool kHasBiasGrad        = kHasBiasGrad_;
-    static constexpr bool kPadS               = kPadS_;
-    static constexpr bool kPadSK              = kPadSK_;
-    static constexpr bool kPadD               = kPadD_;
-    static constexpr bool kPadDv              = kPadDv_;
-    static constexpr bool kIsDeterministic    = kIsDeterministic_;
+    static constexpr ck_tile::index_t HDim = HDim_;
+    using DataType                         = ck_tile::remove_cvref_t<DataType_>;
+    static constexpr bool kIsGroupMode     = kIsGroupMode_;
+    using FmhaMask                         = ck_tile::remove_cvref_t<FmhaMask_>;
+    using FmhaDropout                      = ck_tile::remove_cvref_t<FmhaDropout_>;
+    static constexpr auto BiasEnum         = BiasEnum_;
+    static constexpr bool kHasBiasGrad     = kHasBiasGrad_;
+    static constexpr bool kPadD            = kPadD_;
+    static constexpr bool kPadDv           = kPadDv_;
+    static constexpr bool kIsDeterministic = kIsDeterministic_;
 };

 template <typename Traits_>
--- a/example/ck_tile/01_fmha/generate.py
+++ b/example/ck_tile/01_fmha/generate.py
@@ -126,9 +126,6 @@ if __name__ == "__main__":
    filter_list.extend([''] * (len(api_list) - len(filter_list)))
    optdim_list = [int(hdim) for hdim in args.optdim.split(',')]

-    if len(api_list) > 1:
-        assert optdim_list == [-1]
-
    if args.list_blobs is not None:
        list_blobs(args.list_blobs, api_list, filter_list, optdim_list, int(args.receipt), mask_impl=args.mask)
    else:
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
@@ -191,8 +191,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        return base_str;
    }();

-    std::cout << "[" << prec_str << "]"
-              << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
+    std::cout << "[" << prec_str << "]" << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
              << ", xr_stride:" << xr_stride << ", y_stride:" << y_stride
              << ", yr_stride:" << yr_stride << std::flush;

--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
@@ -2,9 +2,15 @@ add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
 add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp)
 add_executable(tile_example_gemm_weight_preshuffle EXCLUDE_FROM_ALL gemm_weight_preshuffle.cpp)
 set(EXAMPLE_GEMM_COMPILE_OPTIONS)
+set(EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS)
 if(CK_USE_OCP_FP8)
  list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
 endif()
 list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
+list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-unused-local-typedef)
+list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-gnu-line-marker)
+list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS --save-temps)
+list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm -enable-noalias-to-md-conversion=0")
 target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+target_compile_options(tile_example_gemm_weight_preshuffle PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS})
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -24,7 +24,7 @@ template <typename GemmConfig,
          typename CLayout,
          bool Persistent,
          typename CDEElementWise>
-float gemm(const ck_tile::GemmHostArgs</*NumDTensor = 0*/>& args, const ck_tile::stream_config& s)
+float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)

 {
    if constexpr(Persistent)
--- a/example/ck_tile/03_gemm/gemm_utils.hpp
+++ b/example/ck_tile/03_gemm/gemm_utils.hpp
@@ -14,12 +14,13 @@
 #define CK_TILE_PIPELINE_MEMORY 2
 #define CK_TILE_PIPELINE_COMPUTE_V4 3
 #define CK_TILE_PIPELINE_COMPUTE_V5 4
-#define CK_TILE_PIPELINE_PRESHUFFLE 5
+#define CK_TILE_PIPELINE_PRESHUFFLE_V1 5
+#define CK_TILE_PIPELINE_PRESHUFFLE_V2 6

 template <typename PrecType, ck_tile::index_t M_Warp_Tile>
 constexpr ck_tile::index_t get_k_warp_tile()
 {
-#if defined(__gfx950__)
+#if defined(CK_GFX950_SUPPORT)
    constexpr bool is_8bit_float =
        std::is_same_v<PrecType, ck_tile::fp8_t> || std::is_same_v<PrecType, ck_tile::bf8_t>;
    if constexpr(M_Warp_Tile == 32)
@@ -36,7 +37,7 @@ constexpr ck_tile::index_t get_k_warp_tile()
 template <typename PrecType, ck_tile::index_t M_Warp_Tile>
 constexpr ck_tile::index_t get_k_warp_tile_flatmm()
 {
-#if defined(__gfx950__)
+#if defined(CK_GFX950_SUPPORT)
    if constexpr(M_Warp_Tile == 32)
        return sizeof(PrecType) == 2 ? 16 : 64;
    else
@@ -114,16 +115,16 @@ template <typename PrecType>
 struct GemmConfigComputeV3 : public GemmConfigBase
 {
    // Compute V3 only support Intrawave scheduler
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType);
+    static constexpr ck_tile::index_t M_Tile = 16;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);

-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
    static constexpr ck_tile::index_t K_Warp = 1;

-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();

    static constexpr bool DoubleSmemBuffer     = false;
@@ -231,7 +232,7 @@ struct GemmConfigComputeV5 : public GemmConfigBase
 };

 template <typename PrecType>
-struct GemmConfigPreshufle_1 : public GemmConfigBase
+struct GemmConfigPreshuffle_1 : public GemmConfigBase
 {
    static constexpr ck_tile::index_t M_Tile = 128;
    static constexpr ck_tile::index_t N_Tile = 128;
@@ -247,13 +248,13 @@ struct GemmConfigPreshufle_1 : public GemmConfigBase

    static constexpr int kBlockPerCu           = 2;
    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
-    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE_V1;
    static constexpr bool Preshuffle           = true;
    static constexpr bool DoubleSmemBuffer     = false;
 };

 template <typename PrecType>
-struct GemmConfigPreshufle_2 : public GemmConfigBase
+struct GemmConfigPreshuffle_2 : public GemmConfigBase
 {
    static constexpr ck_tile::index_t M_Tile = 128;
    static constexpr ck_tile::index_t N_Tile = 128;
@@ -263,15 +264,15 @@ struct GemmConfigPreshufle_2 : public GemmConfigBase
    static constexpr ck_tile::index_t N_Warp = 4;
    static constexpr ck_tile::index_t K_Warp = 1;

-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile_flatmm<PrecType, M_Warp_Tile>();

    static constexpr int kBlockPerCu           = 2;
    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
-    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE_V2;
    static constexpr bool Preshuffle           = true;
-    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr bool DoubleSmemBuffer     = true;
 };

 template <typename ADataType, typename BDataType = ADataType, typename CDataType = ADataType>
@@ -429,7 +430,7 @@ struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V5>
 };

 template <>
-struct PipelineTypeTraits<CK_TILE_PIPELINE_PRESHUFFLE>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_PRESHUFFLE_V1>
 {
    template <typename PipelineProblem>
    using GemmPipeline = ck_tile::WeightPreshufflePipelineAGmemBGmemCRegV1<PipelineProblem>;
@@ -438,6 +439,16 @@ struct PipelineTypeTraits<CK_TILE_PIPELINE_PRESHUFFLE>
        ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV1<PipelineProblem>;
 };

+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_PRESHUFFLE_V2>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::WeightPreshufflePipelineAGmemBGmemCRegV2<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline =
+        ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV2<PipelineProblem>;
+};
+
 auto create_args(int argc, char* argv[])
 {
    ck_tile::ArgParser arg_parser;
@@ -457,7 +468,9 @@ auto create_args(int argc, char* argv[])
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
        .insert("split_k", "1", "splitK value")
        .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
-        .insert("persistent", "0", "0:non-persistent, 1:persistent");
+        .insert("persistent", "0", "0:non-persistent, 1:persistent")
+        .insert("flush_cache", "true", "flush cache before running the kernel, defaults to true")
+        .insert("rotating_count", "1", "rotating count, defaults to 1");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
@@ -475,4 +488,4 @@ template <typename ADataType,
          typename CLayout,
          bool Persistent = false,
          typename CDEElementWise>
-float gemm(const ck_tile::GemmHostArgs</*NumDTensor = 0*/>& args, const ck_tile::stream_config& s);
+float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s);
--- a/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp
+++ b/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp
@@ -25,7 +25,7 @@ template <typename GemmConfig,
          typename ELayout,
          bool Persistent,
          typename CDEElementWise>
-float gemm(const ck_tile::GemmHostArgs</*NumDTensor = 0*/>& args, const ck_tile::stream_config& s)
+float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)

 {
    using GemmShape = ck_tile::TileGemmShape<
@@ -74,119 +74,116 @@ float gemm(const ck_tile::GemmHostArgs</*NumDTensor = 0*/>& args, const ck_tile:
    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
    float ave_time{0};

-    const auto Run =
-        [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
-            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-            constexpr auto tail_number_v    = tail_number_.value;
-            constexpr auto scheduler        = GemmConfig::Scheduler;
-            constexpr auto memory_operation = memory_operation_.value;
+    const auto Run = [&](const auto has_hot_loop_,
+                         const auto tail_number_,
+                         const auto memory_operation_) {
+        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+        constexpr auto tail_number_v    = tail_number_.value;
+        constexpr auto scheduler        = GemmConfig::Scheduler;
+        constexpr auto memory_operation = memory_operation_.value;

-            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                                               BDataType,
-                                                                               AccDataType,
-                                                                               GemmShape,
-                                                                               GemmUniversalTraits,
-                                                                               scheduler,
-                                                                               has_hot_loop_v,
-                                                                               tail_number_v>;
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler,
+                                                                           has_hot_loop_v,
+                                                                           tail_number_v>;

-            using GemmPipeline = typename PipelineTypeTraits<
-                GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;

-            using GemmEpilogue = ck_tile::CShuffleEpilogue<
-                ck_tile::CShuffleEpilogueProblem<ADataType,
-                                                 BDataType,
-                                                 DsDataType,
-                                                 AccDataType,
-                                                 CDataType,
-                                                 DsLayout,
-                                                 ELayout,
-                                                 CDEElementWise,
-                                                 UniversalGemmProblem::kBlockSize,
-                                                 TilePartitioner::MPerBlock,
-                                                 TilePartitioner::NPerBlock,
-                                                 GemmConfig::M_Warp,
-                                                 GemmConfig::N_Warp,
-                                                 GemmConfig::M_Warp_Tile,
-                                                 GemmConfig::N_Warp_Tile,
-                                                 GemmConfig::K_Warp_Tile,
-                                                 UniversalGemmProblem::TransposeC,
-                                                 memory_operation,
-                                                 GemmConfig::NumWaveGroups>>;
-            using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-            auto kargs   = Kernel::MakeKernelArgs(args);
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             UniversalGemmProblem::kBlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             memory_operation,
+                                             GemmConfig::NumWaveGroups>>;
+        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);

-            dim3 grids;
-            if constexpr(Persistent)
-            {
-                grids = Kernel::MaxOccupancyGridSize(s);
-            }
-            else
-            {
-                grids = Kernel::GridSize(args.M, args.N, args.k_batch);
-            }
-            constexpr dim3 blocks = Kernel::BlockSize();
+        dim3 grids;
+        if constexpr(Persistent)
+        {
+            grids = Kernel::MaxOccupancyGridSize(s);
+        }
+        else
+        {
+            grids = Kernel::GridSize(args.M, args.N, args.k_batch);
+        }
+        constexpr dim3 blocks = Kernel::BlockSize();

-            if(!Kernel::IsSupportedArgument(kargs))
-            {
-                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
-            }
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }

-            if(s.log_level_ > 0)
-            {
-                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                          << "shape: " << GemmShape::GetName() << '\n'
-                          << "problem: " << UniversalGemmProblem::GetName() << '\n'
-                          << "pipeline: " << GemmPipeline::GetName() << '\n'
-                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
-                          << "}" << std::endl;
-            }
-            if(s.flush_cache_)
-            {
-                std::cout << "Flushing cache..." << std::endl;
-                static constexpr ck_tile::index_t APackedSize =
-                    std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
-                static constexpr ck_tile::index_t BPackedSize =
-                    std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << GemmShape::GetName() << '\n'
+                      << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                      << "pipeline: " << GemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+        if(s.flush_cache_)
+        {
+            std::cout << "Flushing cache..." << std::endl;

-                ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
-                    args.M, args.K, args.stride_A, is_row_major(ALayout{})));
-                ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
-                    args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                args.K, args.N, args.stride_B, is_row_major(BLayout{})));

-                auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
-                auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
+            auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+            auto size_b_buffer = b_n.get_element_space_size_in_bytes();

-                ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
-                    kargs.a_ptr, kargs.b_ptr, s.rotating_count_, size_a_buffer, size_b_buffer);
-                rotating_mem.Print();
+            ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
+                kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
+            rotating_mem.Print();

-                auto run_flush_cache = [&]() {
-                    // flush icache
-                    ck_tile::flush_icache();
-                    // rotating mem
-                    rotating_mem.Next();
-                    // clear c mem
-                    if(args.k_batch > 1)
-                        hipGetErrorString(hipMemsetAsync(
-                            args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
-                };
-                ave_time = ck_tile::launch_kernel_preprocess(
-                    s,
-                    run_flush_cache,
-                    ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
-                        Kernel{}, grids, blocks, 0, kargs));
-            }
-            else
-            {
-                ave_time =
-                    ck_tile::launch_kernel(s,
-                                           ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
-                                               Kernel{}, grids, blocks, 0, kargs));
-            }
-            return ave_time;
-        };
+            auto run_flush_cache = [&]() {
+                // flush icache
+                ck_tile::flush_icache();
+                // rotating mem
+                rotating_mem.Next();
+                // clear c mem
+                if(args.k_batch > 1)
+                    hipGetErrorString(hipMemsetAsync(
+                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+            };
+            ave_time = ck_tile::launch_kernel_time_mask(
+                s,
+                run_flush_cache,
+                ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
+                    Kernel{}, grids, blocks, 0, kargs));
+        }
+        else
+        {
+            ave_time =
+                ck_tile::launch_kernel(s,
+                                       ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
+                                           Kernel{}, grids, blocks, 0, kargs));
+        }
+        return ave_time;
+    };

    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
        if(args.k_batch == 1)
@@ -282,13 +279,11 @@ int main(int argc, char* argv[])
 {
    try
    {
-        return !run_gemm_example<GemmConfigPreshufle_1>(argc, argv);
+        return !run_gemm_example<GemmConfigPreshuffle_2>(argc, argv);
    }
    catch(const std::runtime_error& e)
    {
        std::cerr << "Caught runtime error: " << e.what() << '\n';
-        // Return a non-zero code to indicate failure
        return EXIT_FAILURE;
    }
-    return EXIT_SUCCESS;
 }
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -158,7 +158,7 @@ template <typename GemmConfig,
          typename CLayout,
          bool Persistent,
          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float gemm(const ck_tile::GemmHostArgs<>& args, const ck_tile::stream_config& s);
+float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s);

 template <typename GemmConfig,
          typename ADataType,
@@ -183,20 +183,20 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                  ck_tile::index_t kbatch,
                  int n_warmup,
                  int n_repeat,
-                  bool persistent)
+                  bool persistent,
+                  bool flush_cache,
+                  int rotating_count)
 {
-    ck_tile::GemmHostArgs</*NumDTensor = 0*/> args = {a_m_k_dev_buf.GetDeviceBuffer(),
-                                                      b_k_n_dev_buf.GetDeviceBuffer(),
-                                                      {},
-                                                      c_m_n_dev_buf.GetDeviceBuffer(),
-                                                      kbatch,
-                                                      M,
-                                                      N,
-                                                      K,
-                                                      stride_A,
-                                                      stride_B,
-                                                      {},
-                                                      stride_C};
+    ck_tile::GemmHostArgs args = {a_m_k_dev_buf.GetDeviceBuffer(),
+                                  b_k_n_dev_buf.GetDeviceBuffer(),
+                                  c_m_n_dev_buf.GetDeviceBuffer(),
+                                  kbatch,
+                                  M,
+                                  N,
+                                  K,
+                                  stride_A,
+                                  stride_B,
+                                  stride_C};

    float ave_time;
    if(persistent)
@@ -213,7 +213,9 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                        CLayout,
                        true,
                        CDEElementWise>(
-            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
+            args,
+            ck_tile::stream_config{
+                nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count});
    }
    else
    {
@@ -229,7 +231,9 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                        CLayout,
                        false,
                        CDEElementWise>(
-            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
+            args,
+            ck_tile::stream_config{
+                nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count});
    }

    std::size_t flop = std::size_t(2) * M * N * K;
@@ -238,15 +242,16 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_byte / 1.E6 / ave_time;

-    std::cout << "Run Gemm kernel with M=" << M << " N=" << N << " K=" << K
+    std::cout << "Run Gemm kernel with \n M=" << M << " N=" << N << " K=" << K
              << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C
              << " A_Layout=" << ALayout::name << " B_Layout =" << BLayout::name
              << " C_Layout=" << CLayout::name << " A_Type=" << DataTypeTraits<ADataType>::name
              << " B_Type=" << DataTypeTraits<BDataType>::name
              << " C_Type=" << DataTypeTraits<CDataType>::name
              << " StructuredSparsity=" << (GemmConfig::UseStructuredSparsity ? "on" : "off")
-              << " Persistent=" << (persistent ? "on" : "off") << " : " << ave_time << " ms, "
-              << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
+              << " Persistent=" << (persistent ? "on" : "off") << " : \n"
+              << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;

    return ave_time;
 }
@@ -299,6 +304,8 @@ int run_gemm_example_with_layouts(int argc,
    int n_repeat                 = arg_parser.get_int("repeat");
    ck_tile::index_t init_method = arg_parser.get_int("init");
    bool persistent              = arg_parser.get_int("persistent");
+    bool flush_cache             = arg_parser.get_bool("flush_cache");
+    int rotating_count           = arg_parser.get_int("rotating_count");

    const bool preshuffle = GemmConfig::Preshuffle;

@@ -416,7 +423,9 @@ int run_gemm_example_with_layouts(int argc,
                         kbatch,
                         n_warmup,
                         n_repeat,
-                         persistent);
+                         persistent,
+                         flush_cache,
+                         rotating_count);

    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
    bool pass = true;
--- a/example/ck_tile/03_gemm/universal_gemm.cpp
+++ b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -25,7 +25,7 @@ template <typename GemmConfig,
          typename ELayout,
          bool Persistent,
          typename CDEElementWise>
-float gemm(const ck_tile::GemmHostArgs</*NumDTensor = 0*/>& args, const ck_tile::stream_config& s)
+float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)

 {
    using GemmShape = ck_tile::TileGemmShape<
@@ -74,120 +74,117 @@ float gemm(const ck_tile::GemmHostArgs</*NumDTensor = 0*/>& args, const ck_tile:
    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
    float ave_time{0};

-    const auto Run =
-        [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
-            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-            constexpr auto tail_number_v    = tail_number_.value;
-            constexpr auto scheduler        = GemmConfig::Scheduler;
-            constexpr auto memory_operation = memory_operation_.value;
+    const auto Run = [&](const auto has_hot_loop_,
+                         const auto tail_number_,
+                         const auto memory_operation_) {
+        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+        constexpr auto tail_number_v    = tail_number_.value;
+        constexpr auto scheduler        = GemmConfig::Scheduler;
+        constexpr auto memory_operation = memory_operation_.value;

-            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                                               BDataType,
-                                                                               AccDataType,
-                                                                               GemmShape,
-                                                                               GemmUniversalTraits,
-                                                                               scheduler,
-                                                                               has_hot_loop_v,
-                                                                               tail_number_v>;
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler,
+                                                                           has_hot_loop_v,
+                                                                           tail_number_v>;

-            using GemmPipeline = typename PipelineTypeTraits<
-                GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;

-            using GemmEpilogue = ck_tile::CShuffleEpilogue<
-                ck_tile::CShuffleEpilogueProblem<ADataType,
-                                                 BDataType,
-                                                 DsDataType,
-                                                 AccDataType,
-                                                 CDataType,
-                                                 DsLayout,
-                                                 ELayout,
-                                                 CDEElementWise,
-                                                 UniversalGemmProblem::kBlockSize,
-                                                 TilePartitioner::MPerBlock,
-                                                 TilePartitioner::NPerBlock,
-                                                 GemmConfig::M_Warp,
-                                                 GemmConfig::N_Warp,
-                                                 GemmConfig::M_Warp_Tile,
-                                                 GemmConfig::N_Warp_Tile,
-                                                 GemmConfig::K_Warp_Tile,
-                                                 UniversalGemmProblem::TransposeC,
-                                                 memory_operation,
-                                                 GemmConfig::NumWaveGroups>>;
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             UniversalGemmProblem::kBlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             memory_operation,
+                                             GemmConfig::NumWaveGroups>>;

-            using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-            auto kargs   = Kernel::MakeKernelArgs(args);
+        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);

-            dim3 grids;
-            if constexpr(Persistent)
-            {
-                grids = Kernel::MaxOccupancyGridSize(s);
-            }
-            else
-            {
-                grids = Kernel::GridSize(args.M, args.N, args.k_batch);
-            }
-            constexpr dim3 blocks = Kernel::BlockSize();
+        dim3 grids;
+        if constexpr(Persistent)
+        {
+            grids = Kernel::MaxOccupancyGridSize(s);
+        }
+        else
+        {
+            grids = Kernel::GridSize(args.M, args.N, args.k_batch);
+        }
+        constexpr dim3 blocks = Kernel::BlockSize();

-            if(!Kernel::IsSupportedArgument(kargs))
-            {
-                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
-            }
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }

-            if(s.log_level_ > 0)
-            {
-                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                          << "shape: " << GemmShape::GetName() << '\n'
-                          << "problem: " << UniversalGemmProblem::GetName() << '\n'
-                          << "pipeline: " << GemmPipeline::GetName() << '\n'
-                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
-                          << "}" << std::endl;
-            }
-            if(s.flush_cache_)
-            {
-                std::cout << "Flushing cache..." << std::endl;
-                static constexpr ck_tile::index_t APackedSize =
-                    std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
-                static constexpr ck_tile::index_t BPackedSize =
-                    std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << GemmShape::GetName() << '\n'
+                      << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                      << "pipeline: " << GemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+        if(s.flush_cache_)
+        {
+            std::cout << "Flushing cache..." << std::endl;

-                ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
-                    args.M, args.K, args.stride_A, is_row_major(ALayout{})));
-                ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
-                    args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                args.K, args.N, args.stride_B, is_row_major(BLayout{})));

-                auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
-                auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
+            auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+            auto size_b_buffer = b_n.get_element_space_size_in_bytes();

-                ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
-                    kargs.a_ptr, kargs.b_ptr, s.rotating_count_, size_a_buffer, size_b_buffer);
-                rotating_mem.Print();
+            ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
+                kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
+            rotating_mem.Print();

-                auto run_flush_cache = [&]() {
-                    // flush icache
-                    ck_tile::flush_icache();
-                    // rotating mem
-                    rotating_mem.Next();
-                    // clear c mem
-                    if(args.k_batch > 1)
-                        hipGetErrorString(hipMemsetAsync(
-                            args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
-                };
-                ave_time = ck_tile::launch_kernel_preprocess(
-                    s,
-                    run_flush_cache,
-                    ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
-                        Kernel{}, grids, blocks, 0, kargs));
-            }
-            else
-            {
-                ave_time =
-                    ck_tile::launch_kernel(s,
-                                           ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
-                                               Kernel{}, grids, blocks, 0, kargs));
-            }
-            return ave_time;
-        };
+            auto run_flush_cache = [&]() {
+                // flush icache
+                ck_tile::flush_icache();
+                // rotating mem
+                rotating_mem.Next();
+                // clear c mem
+                if(args.k_batch > 1)
+                    hipGetErrorString(hipMemsetAsync(
+                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+            };
+            ave_time = ck_tile::launch_kernel_time_mask(
+                s,
+                run_flush_cache,
+                ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
+                    Kernel{}, grids, blocks, 0, kargs));
+        }
+        else
+        {
+            ave_time =
+                ck_tile::launch_kernel(s,
+                                       ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
+                                           Kernel{}, grids, blocks, 0, kargs));
+        }
+        return ave_time;
+    };

    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
        if(args.k_batch == 1)
--- a/example/ck_tile/04_img2col/image_to_column.cpp
+++ b/example/ck_tile/04_img2col/image_to_column.cpp
@@ -149,9 +149,17 @@ int main(int argc, char* argv[])
    float ave_time =
        image_to_column(traits, args, ck_tile::stream_config{nullptr, config.time_kernel});

-    std::size_t num_btype = G * NHoWo * CYX * (sizeof(OutDataType) + sizeof(InDataType));
-    float gb_per_sec      = num_btype / 1.E6 / ave_time;
-    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+    if(config.time_kernel)
+    {
+        std::size_t num_btype = G * NHoWo * CYX * (sizeof(OutDataType) + sizeof(InDataType));
+        float gb_per_sec      = num_btype / 1.E6 / ave_time;
+        std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+    }
+    else
+    {
+        std::cout << "image_to_column: pass, No Perf generated due to config.time_kernel=0"
+                  << std::endl;
+    }

    bool pass = true;

--- a/example/ck_tile/05_reduce/reduce.cpp
+++ b/example/ck_tile/05_reduce/reduce.cpp
@@ -1,16 +1,21 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #include "ck_tile/host.hpp"
-#include "reduce.hpp"
+#include "ck_tile/ops/reduce.hpp"
 #include <cstring>

 auto create_args(int argc, char* argv[])
 {
    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("m", "3328", "m dimension")
-        .insert("n", "4096", "n dimension")
+    arg_parser.insert("n", "32", "n dimension")
+        .insert("h", "7", "h dimension")
+        .insert("w", "7", "w dimension")
+        .insert("c", "512", "c dimension")
        .insert("v", "1", "cpu validation or not")
        .insert("prec", "fp16", "precision")
-        .insert("warmup", "5", "cold iter")
-        .insert("repeat", "20", "hot iter");
+        .insert("warmup", "0", "cold iter")
+        .insert("repeat", "1", "hot iter");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
@@ -23,15 +28,28 @@ bool run(const ck_tile::ArgParser& arg_parser)
    using ComputeDataType = float;
    using YDataType       = DataType;

-    ck_tile::index_t m = arg_parser.get_int("m");
-    ck_tile::index_t n = arg_parser.get_int("n");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t H = arg_parser.get_int("h");
+    ck_tile::index_t W = arg_parser.get_int("w");
+    ck_tile::index_t C = arg_parser.get_int("c");
    int do_validation  = arg_parser.get_int("v");
    int warmup         = arg_parser.get_int("warmup");
    int repeat         = arg_parser.get_int("repeat");

-    ck_tile::HostTensor<XDataType> x_host({m, n});
-    ck_tile::HostTensor<YDataType> y_host_ref({m});
-    ck_tile::HostTensor<YDataType> y_host_dev({m});
+    std::vector<ck_tile::index_t> problem_shape = {N, H, W, C};
+    std::vector<ck_tile::index_t> strides(4);
+    strides[0] = H * W * C;
+    strides[1] = W * C;
+    strides[2] = C;
+    strides[3] = 1;
+
+    // Define reduction specification:
+    constexpr auto kept_dim    = ck_tile::sequence<0, 3>{}; // Which dimension to keep
+    constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce
+
+    ck_tile::HostTensor<XDataType> x_host(problem_shape, strides);
+    ck_tile::HostTensor<YDataType> y_host_ref({N, C}, {C, 1});
+    ck_tile::HostTensor<YDataType> y_host_dev({N, C}, {C, 1});

    ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);

@@ -54,7 +72,9 @@ bool run(const ck_tile::ArgParser& arg_parser)

    constexpr ck_tile::index_t kBlockSize  = 256;
    constexpr ck_tile::index_t kBlockPerCu = 1;
-    ck_tile::index_t kGridSize             = (m / BlockTile::at(ck_tile::number<0>{}));
+    ck_tile::index_t kept_dim_len_prod     = N * C;
+    ck_tile::index_t kGridSize = (kept_dim_len_prod + BlockTile::at(ck_tile::number<0>{}) - 1) /
+                                 BlockTile::at(ck_tile::number<0>{});
    std::cout << "grid size " << kGridSize << std::endl;

    using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, Vector>;
@@ -63,6 +83,17 @@ bool run(const ck_tile::ArgParser& arg_parser)

    using Kernel = ck_tile::Reduce<Porblem>;

+    // Create input tensor shape and strides
+    auto input_shape =
+        ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
+    auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2], strides[3]);
+
+    if(!Kernel::IsSupportedArgument(
+           C, input_strides)) // output tensor's continuous dimension and input strides
+    {
+        throw std::runtime_error("Wrong! Arguments not supported!\n");
+    }
+
    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
                                       Kernel{},
@@ -71,10 +102,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                       0,
                                       static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
-                                       m,
-                                       n));
+                                       input_shape,
+                                       input_strides,
+                                       kept_dim,
+                                       reduce_dims));

-    std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m;
+    std::size_t num_btype = sizeof(XDataType) * N * C * H * W + sizeof(YDataType) * N * C;

    float gb_per_sec = num_btype / 1.E6 / ave_time;

@@ -86,7 +119,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    {
        // reference
        ck_tile::reference_reduce<XDataType, ComputeDataType, YDataType>(
-            x_host, y_host_ref, ReduceOp{});
+            x_host, y_host_ref, ReduceOp{}, kept_dim, reduce_dims);
        y_buf.FromDevice(y_host_dev.mData.data());
        pass = ck_tile::check_err(y_host_dev, y_host_ref);

--- a/example/ck_tile/05_reduce/reduce.hpp
+++ b/example/ck_tile/05_reduce/reduce.hpp
@@ -1,164 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core.hpp"
-#include "ck_tile/ops/common.hpp"
-#include "ck_tile/ops/reduce/block/block_reduce.hpp"
-#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
-
-namespace ck_tile {
-
-template <typename BlockWarps, // num warps along seq<M, N>
-          typename BlockTile,  // block size, seq<M, N>
-          typename WarpTile,   // warp size, seq<M, N>
-          typename Vector>     // contiguous pixels(vector size) along seq<M, N>
-struct Reduce2dShape
-{
-    static constexpr index_t Block_M = BlockTile::at(number<0>{});
-    static constexpr index_t Block_N = BlockTile::at(number<1>{});
-
-    static constexpr index_t Warp_M = WarpTile::at(number<0>{});
-    static constexpr index_t Warp_N = WarpTile::at(number<1>{});
-
-    static constexpr index_t Vector_M = Vector::at(number<0>{});
-    static constexpr index_t Vector_N = Vector::at(number<1>{});
-
-    static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
-    static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});
-
-    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
-    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
-
-    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
-    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
-
-    static constexpr index_t BlockSize =
-        ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
-};
-
-template <typename XDataType_,
-          typename ComputeDataType_,
-          typename YDataType_,
-          typename BlockShape_,
-          typename ReduceOp_>
-struct Reduce2dProblem
-{
-    using XDataType       = remove_cvref_t<XDataType_>;
-    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
-    using YDataType       = remove_cvref_t<YDataType_>;
-    using BlockShape      = remove_cvref_t<BlockShape_>;
-    using ReduceOp        = ReduceOp_;
-
-    static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
-    static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
-};
-
-template <typename Problem_, typename Policy_ = BlockReduce2dDefaultPolicy>
-struct Reduce
-{
-    using Problem = ck_tile::remove_cvref_t<Problem_>;
-    using Policy  = ck_tile::remove_cvref_t<Policy_>;
-
-    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
-    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
-    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
-
-#if 0
-    CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N)
-    const
-    {
-        using S = typename Problem::BlockShape;
-
-        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
-
-        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
-            p_y, make_tuple(M), number<1>{});
-
-        const auto iM = get_block_id() * S::Block_M;
-
-        auto x_window = make_tile_window(x_m_n,
-                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
-                                         {iM, 0},
-                                         Policy::template MakeXBlockTileDistribution<Problem>());
-
-        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM});
-
-        const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; };
-
-        const XDataType reduce_init_value = 0;
-
-        constexpr auto reduce_dims = sequence<1>{};
-
-        auto y_compute = decltype(block_tile_reduce<ComputeDataType>(
-            load_tile(x_window), reduce_dims, f_reduce, reduce_init_value)){};
-
-        set_tile(y_compute, reduce_init_value);
-
-        index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
-
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
-        {
-            const auto x = load_tile(x_window);
-            block_tile_reduce(y_compute, x, reduce_dims, f_reduce);
-            move_tile_window(x_window, {0, S::Block_N});
-        }
-
-        block_tile_reduce_sync(y_compute, f_reduce);
-
-        store_tile(y_window, cast_tile<YDataType>(y_compute));
-    }
-#else
-    CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) const
-    {
-        using S = typename Problem::BlockShape;
-
-        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
-
-        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
-            p_y, make_tuple(M), number<1>{});
-
-        const auto iM = get_block_id() * S::Block_M;
-
-        auto x_window = make_tile_window(x_m_n,
-                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
-                                         {iM, 0},
-                                         Policy::template MakeXBlockTileDistribution<Problem>());
-
-        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM});
-
-        __shared__ char smem[Policy::template GetSmemSize<Problem>()];
-
-        index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
-
-        auto reduce_func         = typename Problem::ReduceOp{};
-        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
-        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
-        auto block_reduce2d_cross_warp_sync =
-            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
-
-        using XTensorType = decltype(load_tile(x_window));
-        auto y_compute    = block_reduce2d.template MakeYBlockTile<XTensorType>();
-        set_tile(y_compute, reduce_func.template GetIdentityValue<ComputeDataType>());
-
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
-        {
-            const auto x = load_tile(x_window);
-            block_reduce2d(x, y_compute, reduce_func);
-            move_tile_window(x_window, {0, S::Block_N});
-        }
-
-        block_reduce2d_sync(y_compute, reduce_func);
-        block_reduce2d_cross_warp_sync(y_compute, smem, reduce_func);
-
-        store_tile(y_window, cast_tile<YDataType>(y_compute));
-    }
-#endif
-};
-
-} // namespace ck_tile
--- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
@@ -333,12 +333,12 @@ struct matrix_core_swizzle_kernel
                    return tmp_1;
 #else
                    // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv,
-                    constexpr index_t kv = Alignment;
-                    constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
-                    constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+                    constexpr index_t kv          = Alignment;
+                    constexpr index_t nw          = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+                    constexpr index_t kw          = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
                    constexpr index_t waveflatten = kw * nw * kv;
-                    const index_t kr = a_.k / (k1 * k2);
-                    const index_t nr = a_.n / nw;
+                    const index_t kr              = a_.k / (k1 * k2);
+                    const index_t nr              = a_.n / nw;
                    auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
                        p_dst,
                        make_tuple(nr, kr, waveflatten),
@@ -387,8 +387,8 @@ struct matrix_core_swizzle_kernel
                    constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
                    constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
                    constexpr index_t waveflatten_tile = kw * nw * kv;
-                    constexpr index_t nr_tile = NPerBlock / nw;
-                    constexpr index_t kr_tile = KPerBlock / (kw * kv);
+                    constexpr index_t nr_tile          = NPerBlock / nw;
+                    constexpr index_t kr_tile          = KPerBlock / (kw * kv);
                    return make_tile_window(dst_view,
                                            make_tuple(number<nr_tile>{},
                                                       number<kr_tile>{},
--- a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
+++ b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
@@ -183,8 +183,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
            }
        }

-        std::cout << "[" << data_type << "]"
-                  << " m:" << m << ", n:" << n << ", stride:" << stride
+        std::cout << "[" << data_type << "]" << " m:" << m << ", n:" << n << ", stride:" << stride
                  << ", s:" << USEModelSensitive << ", valid:" << (pass ? "y" : "n") << std::flush
                  << std::endl;
    }
--- a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
+++ b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
@@ -193,8 +193,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        return base_str;
    }();

-    std::cout << "[" << prec_str << "]"
-              << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
+    std::cout << "[" << prec_str << "]" << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
              << ", xr_stride:" << xr_stride << ", y_stride:" << y_stride
              << ", yr_stride:" << yr_stride << ", s:" << use_model_sensitive_rmsnorm << std::flush;

--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp
@@ -105,8 +105,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    b_buf.ToDevice(b_host.data());
    gamma_buf.ToDevice(gamma_host.data());

-    std::cout << "[" << input_data_type << ", " << quantized_data_type << "]"
-              << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush;
+    std::cout << "[" << input_data_type << ", " << quantized_data_type << "]" << " m:" << m
+              << ", n:" << n << ", stride:" << stride << std::flush;

    add_rmsnorm2d_rdquant_fwd_traits traits{input_data_type, quantized_data_type, SaveX};

--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
@@ -256,8 +256,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
            }
        }

-        std::cout << "[" << data_type << "]"
-                  << " m:" << m << ", n:" << n << ", stride:" << stride
+        std::cout << "[" << data_type << "]" << " m:" << m << ", n:" << n << ", stride:" << stride
                  << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
    }

--- a/example/ck_tile/12_smoothquant/example_smoothquant.cpp
+++ b/example/ck_tile/12_smoothquant/example_smoothquant.cpp
@@ -216,10 +216,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
            }
        }

-        std::cout << "[" << data_type << "]"
-                  << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
-                  << ", y_stride:" << y_stride << ", valid:" << (pass ? "y" : "n") << std::flush
-                  << std::endl;
+        std::cout << "[" << data_type << "]" << " m:" << m << ", n:" << n
+                  << ", x_stride:" << x_stride << ", y_stride:" << y_stride
+                  << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
    }

    return pass;
--- a/example/ck_tile/12_smoothquant/smoothquant.cpp
+++ b/example/ck_tile/12_smoothquant/smoothquant.cpp
@@ -93,9 +93,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    x_buf.ToDevice(x_host.data());
    smscale_buf.ToDevice(smscale_host.data());

-    std::cout << "[" << data_type << "]"
-              << " m:" << m << ", n:" << n << ", x_stride:" << x_stride << ", y_stride:" << y_stride
-              << std::flush;
+    std::cout << "[" << data_type << "]" << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
+              << ", y_stride:" << y_stride << std::flush;

    smoothquant_traits traits{data_type};

--- a/example/ck_tile/13_moe_sorting/moe_sorting.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting.cpp
@@ -228,20 +228,26 @@ bool test_moe_sorting(ck_tile::ArgParser args)
    moe_sorting_trait trait{
        index_prec, weight_prec, local_expert_masking, clear_inside, dispatch_policy};

-    moe_sorting_args karg
-    {
-        topk_ids_dev.GetDeviceBuffer(), weights_dev.GetDeviceBuffer(),
-            local_expert_masking ? local_expert_masking_dev.GetDeviceBuffer() : nullptr,
-            is_local_token ? local_tokens_dev.GetDeviceBuffer() : nullptr,
-            sorted_ids_dev.GetDeviceBuffer(), sorted_weights_dev.GetDeviceBuffer(),
-            sorted_expert_ids_dev.GetDeviceBuffer(), sorted_id_cnt_dev.GetDeviceBuffer(),
-            moe_buf_bytes > 0 ? moe_buf_dev.GetDeviceBuffer() : nullptr,
-            workspace_size != 0 ? moe_sorting_ws.GetDeviceBuffer() : nullptr, tokens, unit_size,
-            num_experts, topk,
+    moe_sorting_args karg{topk_ids_dev.GetDeviceBuffer(),
+                          weights_dev.GetDeviceBuffer(),
+                          local_expert_masking ? local_expert_masking_dev.GetDeviceBuffer()
+                                               : nullptr,
+                          is_local_token ? local_tokens_dev.GetDeviceBuffer() : nullptr,
+                          sorted_ids_dev.GetDeviceBuffer(),
+                          sorted_weights_dev.GetDeviceBuffer(),
+                          sorted_expert_ids_dev.GetDeviceBuffer(),
+                          sorted_id_cnt_dev.GetDeviceBuffer(),
+                          moe_buf_bytes > 0 ? moe_buf_dev.GetDeviceBuffer() : nullptr,
+                          workspace_size != 0 ? moe_sorting_ws.GetDeviceBuffer() : nullptr,
+                          tokens,
+                          unit_size,
+                          num_experts,
+                          topk,
 #if MOE_SORTING_FMOE_2D_BUF
-            moe_buf_interm_dim, moe_buf_elem_bytes
+                          moe_buf_interm_dim,
+                          moe_buf_elem_bytes
 #else
-            static_cast<ck_tile::long_index_t>(moe_buf_size * sizeof(float))
+                          static_cast<ck_tile::long_index_t>(moe_buf_size * sizeof(float))
 #endif
    };

--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
@@ -40,11 +40,11 @@
    constexpr bool local_expert_masking       = local_expert_masking_;                                  \
    constexpr bool local_token                = local_token_;                                           \
    using ms_problem                          = ck_tile::MoeSortingProblemEx<index_t,                   \
-                                                    ms_weight_type,            \
-                                                    sub_token_tile,            \
-                                                    sub_token_onshot,          \
-                                                    local_expert_masking,      \
-                                                    local_token>;              \
+                                                                             ms_weight_type,            \
+                                                                             sub_token_tile,            \
+                                                                             sub_token_onshot,          \
+                                                                             local_expert_masking,      \
+                                                                             local_token>;              \
    using kernel                              = ck_tile::MoeSortingKernel<ms_problem>;                  \
    auto kargs                                = kernel::MakeKargs(a);                                   \
    const dim3 grids                          = kernel::GridSize(a);                                    \
@@ -200,11 +200,11 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
        constexpr bool expert_masking         = expert_masking_;                                    \
        constexpr bool local_token            = local_token_;                                       \
        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
-                                                        ms_weight_type,        \
-                                                        mesh_type_,            \
-                                                        unroll_num,            \
-                                                        expert_masking,        \
-                                                        local_token>;          \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P0<ms_problem>; \
        auto kargs                            = kernel::MakeKargs(a);                               \
        const dim3 grids                      = kernel::GridSize(a);                                \
@@ -218,11 +218,11 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
        constexpr bool expert_masking         = expert_masking_;                                    \
        constexpr bool local_token            = local_token_;                                       \
        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
-                                                        ms_weight_type,        \
-                                                        mesh_type_,            \
-                                                        unroll_num,            \
-                                                        expert_masking,        \
-                                                        local_token>;          \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P1<ms_problem>; \
        auto kargs                            = kernel::MakeKargs(a);                               \
        const dim3 grids                      = kernel::GridSize(a);                                \
@@ -236,11 +236,11 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
        constexpr bool expert_masking         = expert_masking_;                                    \
        constexpr bool local_token            = local_token_;                                       \
        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
-                                                        ms_weight_type,        \
-                                                        mesh_type_,            \
-                                                        unroll_num,            \
-                                                        expert_masking,        \
-                                                        local_token>;          \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P2<ms_problem>; \
        auto kargs                            = kernel::MakeKargs(a);                               \
        const dim3 grids                      = kernel::GridSize(a);                                \
@@ -254,11 +254,11 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
        constexpr bool expert_masking         = expert_masking_;                                    \
        constexpr bool local_token            = local_token_;                                       \
        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
-                                                        ms_weight_type,        \
-                                                        mesh_type_,            \
-                                                        unroll_num,            \
-                                                        expert_masking,        \
-                                                        local_token>;          \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P3<ms_problem>; \
        auto kargs                            = kernel::MakeKargs(a);                               \
        const dim3 grids                      = kernel::GridSize(a);                                \
@@ -273,11 +273,11 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
        constexpr bool expert_masking         = expert_masking_;                                     \
        constexpr bool local_token            = local_token_;                                        \
        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,             \
-                                                        ms_weight_type,         \
-                                                        mesh_type_,             \
-                                                        unroll_num,             \
-                                                        expert_masking,         \
-                                                        local_token>;           \
+                                                                             ms_weight_type,         \
+                                                                             mesh_type_,             \
+                                                                             unroll_num,             \
+                                                                             expert_masking,         \
+                                                                             local_token>;           \
        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P23<ms_problem>; \
        auto kargs                            = kernel::MakeKargs(a);                                \
        const dim3 grids                      = kernel::GridSize(a);                                 \
--- a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp
+++ b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp
@@ -124,9 +124,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
    smscale_buf.ToDevice(smscale_host.data());
    topk_ids_buf.ToDevice(topk_ids_host.data());

-    std::cout << "[" << prec_i << "-" << prec_o << "]"
-              << " tokens:" << tokens << ", hidden_size:" << hidden_size << ", stride:" << stride
-              << ", experts:" << experts << ", topk:" << topk << std::flush;
+    std::cout << "[" << prec_i << "-" << prec_o << "]" << " tokens:" << tokens
+              << ", hidden_size:" << hidden_size << ", stride:" << stride << ", experts:" << experts
+              << ", topk:" << topk << std::flush;

    moe_smoothquant_traits traits{prec_i, prec_o};

--- a/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp
@@ -25,27 +25,27 @@ float fused_moe(fused_moe_traits t, fused_moe_args a, const ck_tile::stream_conf
    }();

    auto t0 = fused_moesorting_trait{"int32", "fp32", t.local_expert_masking};
-    auto a0 = fused_moesorting_args
-    {
-        a.topk_ids_ptr,              // const void* p_topk_ids;
-            a.topk_weight_ptr,       // const void* p_weights;
-            a.local_expert_mask_ptr, // const void* p_local_expert_mask;
-            a.local_tokens,
-            a.sorted_token_ids_ptr,  // void* p_sorted_token_ids;
-            a.sorted_weight_ptr,     // void* p_sorted_weights;
-            a.sorted_expert_ids_ptr, // void* p_sorted_expert_ids;
-            a.num_sorted_tiles_ptr,  // void* p_total_tokens_post_pad;
-            a.o_ptr,                 // void* p_moe_buf;
-            a.ws_ptr,                // void* p_ws;
-            a.num_tokens,            // index_t tokens;
-            a.block_m,               // index_t unit_size;
-            a.num_experts,           // index_t num_experts;
-            a.topk,                  // index_t topk;
+    auto a0 = fused_moesorting_args{
+        a.topk_ids_ptr,          // const void* p_topk_ids;
+        a.topk_weight_ptr,       // const void* p_weights;
+        a.local_expert_mask_ptr, // const void* p_local_expert_mask;
+        a.local_tokens,
+        a.sorted_token_ids_ptr,  // void* p_sorted_token_ids;
+        a.sorted_weight_ptr,     // void* p_sorted_weights;
+        a.sorted_expert_ids_ptr, // void* p_sorted_expert_ids;
+        a.num_sorted_tiles_ptr,  // void* p_total_tokens_post_pad;
+        a.o_ptr,                 // void* p_moe_buf;
+        a.ws_ptr,                // void* p_ws;
+        a.num_tokens,            // index_t tokens;
+        a.block_m,               // index_t unit_size;
+        a.num_experts,           // index_t num_experts;
+        a.topk,                  // index_t topk;
 #if MOE_SORTING_FMOE_2D_BUF
-            a.stride_token, o_data_bytes,
+        a.stride_token,
+        o_data_bytes,
 #else
-            static_cast<ck_tile::long_index_t>(a.num_tokens) *
-                a.stride_token* o_data_bytes // index_t moe_buf_bytes;
+        static_cast<ck_tile::long_index_t>(a.num_tokens) * a.stride_token *
+            o_data_bytes // index_t moe_buf_bytes;
 #endif
    };

--- a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp
@@ -16,11 +16,11 @@ float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a)
 {
    using f_traits = ck_tile::FusedMoeGemmTraits<Ts_::GateOnly, Ts_::FusedQuant == 1, 1 /*atomic*/>;
    using f_shape  = ck_tile::FusedMoeGemmShape<typename Ts_::BlockTile_0,
-                                               typename Ts_::WarpPerBlock_0,
-                                               typename Ts_::WarpTile_0,
-                                               typename Ts_::BlockTile_1,
-                                               typename Ts_::WarpPerBlock_0,
-                                               typename Ts_::WarpTile_0>;
+                                                typename Ts_::WarpPerBlock_0,
+                                                typename Ts_::WarpTile_0,
+                                                typename Ts_::BlockTile_1,
+                                                typename Ts_::WarpPerBlock_0,
+                                                typename Ts_::WarpTile_0>;

    constexpr auto get_activation_ = []() {
        if constexpr(Ts_::Activation == 0)
--- a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
@@ -40,11 +40,11 @@
    constexpr bool local_expert_masking       = local_expert_masking_;                                  \
    constexpr bool local_token                = local_token_;                                           \
    using ms_problem                          = ck_tile::MoeSortingProblemEx<index_t,                   \
-                                                    ms_weight_type,            \
-                                                    sub_token_tile,            \
-                                                    sub_token_onshot,          \
-                                                    local_expert_masking,      \
-                                                    local_token>;              \
+                                                                             ms_weight_type,            \
+                                                                             sub_token_tile,            \
+                                                                             sub_token_onshot,          \
+                                                                             local_expert_masking,      \
+                                                                             local_token>;              \
    using kernel                              = ck_tile::MoeSortingKernel<ms_problem>;                  \
    auto kargs                                = kernel::MakeKargs(a);                                   \
    const dim3 grids                          = kernel::GridSize(a);                                    \
@@ -204,11 +204,11 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til
        constexpr bool expert_masking         = expert_masking_;                                    \
        constexpr bool local_token            = local_token_;                                       \
        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
-                                                        ms_weight_type,        \
-                                                        mesh_type_,            \
-                                                        unroll_num,            \
-                                                        expert_masking,        \
-                                                        local_token>;          \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P0<ms_problem>; \
        auto kargs                            = kernel::MakeKargs(a);                               \
        const dim3 grids                      = kernel::GridSize(a);                                \
@@ -222,11 +222,11 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til
        constexpr bool expert_masking         = expert_masking_;                                    \
        constexpr bool local_token            = local_token_;                                       \
        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
-                                                        ms_weight_type,        \
-                                                        mesh_type_,            \
-                                                        unroll_num,            \
-                                                        expert_masking,        \
-                                                        local_token>;          \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P1<ms_problem>; \
        auto kargs                            = kernel::MakeKargs(a);                               \
        const dim3 grids                      = kernel::GridSize(a);                                \
@@ -240,11 +240,11 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til
        constexpr bool expert_masking         = expert_masking_;                                    \
        constexpr bool local_token            = local_token_;                                       \
        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
-                                                        ms_weight_type,        \
-                                                        mesh_type_,            \
-                                                        unroll_num,            \
-                                                        expert_masking,        \
-                                                        local_token>;          \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P2<ms_problem>; \
        auto kargs                            = kernel::MakeKargs(a);                               \
        const dim3 grids                      = kernel::GridSize(a);                                \
@@ -258,11 +258,11 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til
        constexpr bool expert_masking         = expert_masking_;                                    \
        constexpr bool local_token            = local_token_;                                       \
        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
-                                                        ms_weight_type,        \
-                                                        mesh_type_,            \
-                                                        unroll_num,            \
-                                                        expert_masking,        \
-                                                        local_token>;          \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P3<ms_problem>; \
        auto kargs                            = kernel::MakeKargs(a);                               \
        const dim3 grids                      = kernel::GridSize(a);                                \
@@ -277,11 +277,11 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til
        constexpr bool expert_masking         = expert_masking_;                                     \
        constexpr bool local_token            = local_token_;                                        \
        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,             \
-                                                        ms_weight_type,         \
-                                                        mesh_type_,             \
-                                                        unroll_num,             \
-                                                        expert_masking,         \
-                                                        local_token>;           \
+                                                                             ms_weight_type,         \
+                                                                             mesh_type_,             \
+                                                                             unroll_num,             \
+                                                                             expert_masking,         \
+                                                                             local_token>;           \
        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P23<ms_problem>; \
        auto kargs                            = kernel::MakeKargs(a);                                \
        const dim3 grids                      = kernel::GridSize(a);                                 \
--- a/example/ck_tile/15_fused_moe/main.cpp
+++ b/example/ck_tile/15_fused_moe/main.cpp
@@ -218,8 +218,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
            return std::string(", st:") + std::to_string(stride);
    }();

-    std::cout << "[" << api_str << "|" << prec_str << "]"
-              << " t:" << tokens;
+    std::cout << "[" << api_str << "|" << prec_str << "]" << " t:" << tokens;

    if(is_local_token)
    {
--- a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
+++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
@@ -50,21 +50,20 @@ float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                          int n_warmup,
                          int n_repeat)
 {
-    ck_tile::BatchedGemmHostArgs args;
-    args.a_ptr          = a_m_k_dev_buf.GetDeviceBuffer();
-    args.b_ptr          = b_k_n_dev_buf.GetDeviceBuffer();
-    args.e_ptr          = c_m_n_dev_buf.GetDeviceBuffer();
-    args.k_batch        = kbatch;
-    args.M              = M;
-    args.N              = N;
-    args.K              = K;
-    args.stride_A       = stride_A;
-    args.stride_B       = stride_B;
-    args.stride_E       = stride_C;
-    args.batch_stride_A = batch_stride_A;
-    args.batch_stride_B = batch_stride_B;
-    args.batch_stride_E = batch_stride_C;
-    args.batch_count    = batch_count;
+    ck_tile::BatchedGemmHostArgs args{a_m_k_dev_buf.GetDeviceBuffer(),
+                                      b_k_n_dev_buf.GetDeviceBuffer(),
+                                      c_m_n_dev_buf.GetDeviceBuffer(),
+                                      kbatch,
+                                      M,
+                                      N,
+                                      K,
+                                      stride_A,
+                                      stride_B,
+                                      stride_C,
+                                      batch_stride_A,
+                                      batch_stride_B,
+                                      batch_stride_C,
+                                      batch_count};

    float ave_time = batched_gemm<ADataType,
                                  BDataType,
--- a/example/ck_tile/17_grouped_gemm/CMakeLists.txt
+++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt
@@ -1,2 +1 @@
 add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp)
-add_executable(tile_example_grouped_gemm_tileloop EXCLUDE_FROM_ALL grouped_gemm_tileloop.cpp)
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <hip/hip_runtime.h>

@@ -16,19 +16,11 @@
 #include "ck_tile/host.hpp"
 #include "grouped_gemm.hpp"

-template <typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename CLayout,
-          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
-                   const ck_tile::stream_config& s,
-                   void* kargs_ptr)
+template <typename ALayout, typename BLayout, typename CLayout>
+float grouped_gemm_tileloop(const ck_tile::stream_config& s,
+                            const ck_tile::index_t num_groups,
+                            void* kargs_ptr,
+                            bool splitk)
 {
 #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
    // Memory friendly for Interwave scheduler
@@ -83,8 +75,6 @@ float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
    constexpr bool kPadN = false;
    constexpr bool kPadK = false;

-    constexpr bool TransposeC = false;
-
    constexpr int kBlockPerCu                         = 1;
    constexpr ck_tile::index_t TileParitionerGroupNum = 8;
    constexpr ck_tile::index_t TileParitionerM01      = 4;
@@ -97,54 +87,41 @@ float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
        GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;

    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
-    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<kPadM,
-                                                                 kPadN,
-                                                                 kPadK,
-                                                                 DoubleSmemBuffer,
-                                                                 ALayout,
-                                                                 BLayout,
-                                                                 CLayout,
-                                                                 TransposeC>;
+    using GemmUniversalTraits = ck_tile::PersistentTileGemmUniversalTraits<kPadM,
+                                                                           kPadN,
+                                                                           kPadK,
+                                                                           DoubleSmemBuffer,
+                                                                           ALayout,
+                                                                           BLayout,
+                                                                           CLayout>;
    using GemmPipelineProblem =
        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;

-    using BaseGemmPipeline = UNIVERSAL_GEMM_PIPELINE<GemmPipelineProblem>;
-
-    const ck_tile::index_t k_grain     = gemm_descs[0].k_batch * K_Tile;
-    const ck_tile::index_t K_split     = (gemm_descs[0].K + k_grain - 1) / k_grain * K_Tile;
-    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
-    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
-    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-
    float ave_time{0};

-    const auto Run = [&](const auto has_hot_loop_,
-                         const auto tail_number_,
-                         const auto memory_operation_) {
-        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-        constexpr auto tail_number_v    = tail_number_.value;
+    const auto Run = [&](const auto memory_operation_) {
        constexpr auto scheduler        = GEMM_PIPELINE_SCHEDULER;
        constexpr auto memory_operation = memory_operation_.value;

+        // We create the GEMM pipeline without specifying hotloop or tailnumber.
+        // These are automatically run inside the kernel based on the given input data.
        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
                                                                           BDataType,
                                                                           AccDataType,
                                                                           GemmShape,
                                                                           GemmUniversalTraits,
-                                                                           scheduler,
-                                                                           has_hot_loop_v,
-                                                                           tail_number_v>;
+                                                                           scheduler>;

        using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
        using GemmEpilogue = ck_tile::CShuffleEpilogue<
            ck_tile::CShuffleEpilogueProblem<ADataType,
                                             BDataType,
-                                             DsDataType,
+                                             ck_tile::tuple<>,
                                             AccDataType,
                                             CDataType,
-                                             DsLayout,
+                                             ck_tile::tuple<>,
                                             CLayout,
-                                             CDEElementWise,
+                                             ck_tile::element_wise::PassThrough,
                                             GemmPipelineProblem::kBlockSize,
                                             TilePartitioner::MPerBlock,
                                             TilePartitioner::NPerBlock,
@@ -156,27 +133,14 @@ float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
                                             UniversalGemmProblem::TransposeC,
                                             memory_operation>>;
        using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-        auto kargs   = Kernel::MakeKargs(gemm_descs);
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Kernel arguments not supported!");
-        }
-
        constexpr dim3 blocks = Kernel::BlockSize();
-        const dim3 grids      = Kernel::GridSize(gemm_descs);
-
-        HIP_CHECK_ERROR(hipMemcpyWithStream(kargs_ptr,
-                                            kargs.data(),
-                                            get_workspace_size(gemm_descs),
-                                            hipMemcpyHostToDevice,
-                                            s.stream_id_));
+        const dim3 grids      = Kernel::MaxOccupancyGridSize(s);

        if(s.log_level_ > 0)
        {
-            std::cout << "Launching kernel: " << Kernel::GetName() << " with args:"
-                      << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
+            std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" << " grid: {"
+                      << grids.x << ", " << grids.y << ", " << grids.z << "}" << ", blocks: {"
+                      << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" << std::endl;
        }

        ave_time =
@@ -187,45 +151,26 @@ float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
                                       blocks,
                                       0,
                                       ck_tile::cast_pointer_to_constant_address_space(kargs_ptr),
-                                       gemm_descs.size()));
+                                       num_groups));

        return ave_time;
    };

-    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
-        if(gemm_descs[0].k_batch == 1)
-        {
-            Run(has_hot_loop_,
-                tail_number_,
-                ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                           ck_tile::memory_operation_enum::set>{});
-        }
-        else
-        {
-            Run(has_hot_loop_,
-                tail_number_,
-                ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                           ck_tile::memory_operation_enum::atomic_add>{});
-        }
-    };
-
-    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+    if(!splitk)
+    {
+        Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                       ck_tile::memory_operation_enum::set>{});
+    }
+    else
+    {
+        Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                       ck_tile::memory_operation_enum::atomic_add>{});
+    }

    return ave_time;
 }

 #include "run_grouped_gemm_example.inc"

-constexpr bool Persistent = false;
-int main(int argc, char* argv[])
-{
-    try
-    {
-        return !run_grouped_gemm_example<Persistent>(argc, argv);
-    }
-    catch(const std::runtime_error& e)
-    {
-        std::cerr << "Runtime error: " << e.what() << '\n';
-        return EXIT_FAILURE;
-    }
-}
+constexpr bool Persistent = true;
+int main(int argc, char* argv[]) { return !run_grouped_gemm_example<Persistent>(argc, argv); }
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
@@ -15,7 +15,7 @@
 #define CK_TILE_PIPELINE_COMPUTE_V4 3

 #ifndef CK_TILE_PIPELINE_DEFAULT
-#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
+#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V4
 #endif

 #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
@@ -54,7 +54,7 @@ using BDataType   = Types::BDataType;
 using AccDataType = Types::AccDataType;
 using CDataType   = Types::CDataType;

-using grouped_gemm_kargs = ck_tile::GemmHostArgs</*NumDTensor = 0*/>;
+using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs;

 auto create_args(int argc, char* argv[])
 {
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp
@@ -1,177 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <hip/hip_runtime.h>
-
-#include <cstring>
-#include <iostream>
-#include <ostream>
-#include <string>
-#include <tuple>
-#include <memory>
-
-#include "ck_tile/core.hpp"
-#include "ck_tile/ops/epilogue.hpp"
-#include "ck_tile/ops/gemm.hpp"
-#include "ck_tile/host.hpp"
-#include "grouped_gemm.hpp"
-
-template <typename ALayout, typename BLayout, typename CLayout>
-float grouped_gemm_tileloop(const ck_tile::stream_config& s,
-                            const ck_tile::index_t num_groups,
-                            void* kargs_ptr,
-                            bool splitk)
-{
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-    // Memory friendly for Interwave scheduler
-    constexpr ck_tile::index_t M_Tile = 128;
-    constexpr ck_tile::index_t N_Tile = 32;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 4;
-    constexpr ck_tile::index_t N_Warp = 1;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 8;
-
-    constexpr bool DoubleSmemBuffer = false;
-#endif
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-    // Compute friendly for Intrawave scheduler
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = false;
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-    // Compute friendly for Intrawave scheduler
-    // Using the ping pong reader in the lds level
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 32;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = true;
-#endif
-
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
-
-    constexpr int kBlockPerCu                         = 1;
-    constexpr ck_tile::index_t TileParitionerGroupNum = 8;
-    constexpr ck_tile::index_t TileParitionerM01      = 4;
-
-    using GemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-    using TilePartitioner = ck_tile::
-        GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
-
-    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
-    using GemmUniversalTraits = ck_tile::PersistentTileGemmUniversalTraits<kPadM,
-                                                                           kPadN,
-                                                                           kPadK,
-                                                                           DoubleSmemBuffer,
-                                                                           ALayout,
-                                                                           BLayout,
-                                                                           CLayout>;
-    using GemmPipelineProblem =
-        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
-
-    float ave_time{0};
-
-    const auto Run = [&](const auto memory_operation_) {
-        constexpr auto scheduler        = GEMM_PIPELINE_SCHEDULER;
-        constexpr auto memory_operation = memory_operation_.value;
-
-        // We create the GEMM pipeline without specifying hotloop or tailnumber.
-        // These are automatically run inside the kernel based on the given input data.
-        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                                           BDataType,
-                                                                           AccDataType,
-                                                                           GemmShape,
-                                                                           GemmUniversalTraits,
-                                                                           scheduler>;
-
-        using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
-        using GemmEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<ADataType,
-                                             BDataType,
-                                             ck_tile::tuple<>,
-                                             AccDataType,
-                                             CDataType,
-                                             ck_tile::tuple<>,
-                                             CLayout,
-                                             ck_tile::element_wise::PassThrough,
-                                             GemmPipelineProblem::kBlockSize,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
-                                             UniversalGemmProblem::TransposeC,
-                                             memory_operation>>;
-        using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-        constexpr dim3 blocks = Kernel::BlockSize();
-        const dim3 grids      = Kernel::MaxOccupancyGridSize(s);
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel: " << Kernel::GetName() << " with args:"
-                      << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
-        }
-
-        ave_time =
-            ck_tile::launch_kernel(s,
-                                   ck_tile::make_kernel<blocks.x, kBlockPerCu>(
-                                       Kernel{},
-                                       grids,
-                                       blocks,
-                                       0,
-                                       ck_tile::cast_pointer_to_constant_address_space(kargs_ptr),
-                                       num_groups));
-
-        return ave_time;
-    };
-
-    if(!splitk)
-    {
-        Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                       ck_tile::memory_operation_enum::set>{});
-    }
-    else
-    {
-        Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                       ck_tile::memory_operation_enum::atomic_add>{});
-    }
-
-    return ave_time;
-}
-
-#include "run_grouped_gemm_example.inc"
-
-constexpr bool Persistent = true;
-int main(int argc, char* argv[]) { return !run_grouped_gemm_example<Persistent>(argc, argv); }
--- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
@@ -83,18 +83,18 @@ float invoke_gemm(int n_warmup,
        const bool splitk = args[0].k_batch > 1;
        for(const auto& arg : args)
        {
-            kargs.emplace_back(ck_tile::GemmKernelArgs<>{arg.a_ptr,
-                                                         arg.b_ptr,
-                                                         {},
-                                                         arg.e_ptr,
-                                                         arg.M,
-                                                         arg.N,
-                                                         arg.K,
-                                                         arg.stride_A,
-                                                         arg.stride_B,
-                                                         {},
-                                                         arg.stride_E,
-                                                         arg.k_batch});
+            kargs.emplace_back(ck_tile::UniversalGemmKernelArgs<>{{arg.a_ptr},
+                                                                  {arg.b_ptr},
+                                                                  {/*arg.ds_ptr*/},
+                                                                  arg.e_ptr,
+                                                                  arg.M,
+                                                                  arg.N,
+                                                                  arg.K,
+                                                                  {arg.stride_A},
+                                                                  {arg.stride_B},
+                                                                  {/*arg.stride_Ds*/},
+                                                                  arg.stride_E,
+                                                                  arg.k_batch});
        }
        const auto stream = ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat};
        HIP_CHECK_ERROR(hipMemcpyWithStream(kargs_ptr,
@@ -216,9 +216,9 @@ int run_grouped_gemm_example_with_layouts(int argc,
        c_m_n_tensors.push_back(ck_tile::HostTensor<CDataType>(
            ck_tile::host_tensor_descriptor(M, N, stride_Cs[i], is_row_major(CLayout{}))));

-        std::cout << "gemm[" << i << "]"
-                  << " a_m_k: " << a_m_k_tensors[i].mDesc << " b_k_n: " << b_k_n_tensors[i].mDesc
-                  << " c_m_n: " << c_m_n_tensors[i].mDesc << std::endl;
+        std::cout << "gemm[" << i << "]" << " a_m_k: " << a_m_k_tensors[i].mDesc
+                  << " b_k_n: " << b_k_n_tensors[i].mDesc << " c_m_n: " << c_m_n_tensors[i].mDesc
+                  << std::endl;

        ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k_tensors[i]);
        ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n_tensors[i]);
@@ -240,7 +240,7 @@ int run_grouped_gemm_example_with_layouts(int argc,
        void* p_c       = c_m_n_dev_buf[i]->GetDeviceBuffer();

        gemm_descs.push_back(
-            {p_a, p_b, {}, p_c, kbatch, M, N, K, stride_As[i], stride_Bs[i], {}, stride_Cs[i]});
+            {p_a, p_b, p_c, kbatch, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]});
    }

    invoke_gemm<ADataType,
--- a/example/ck_tile/18_flatmm/flatmm_basic.cpp
+++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp
@@ -168,7 +168,7 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs<>& args, const ck_tile::stream_c
                    hipGetErrorString(hipMemsetAsync(
                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
            };
-            ave_time = ck_tile::launch_kernel_preprocess(
+            ave_time = ck_tile::launch_kernel_time_mask(
                s,
                run_flush_cache,
                ck_tile::make_kernel<blocks.x, FlatmmConfig::kBlockPerCu>(
@@ -219,6 +219,7 @@ int run_flatmm_example(int argc, char* argv[])
    std::string b_layout  = arg_parser.get_str("b_layout");
    if(a_layout == "R" && b_layout == "C")
    {
+
        if(data_type == "fp16")
        {
            run_flatmm_example_with_layouts<ck_tile::half_t, FlatmmConfig<ck_tile::half_t>>(
--- a/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp
+++ b/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp
@@ -157,7 +157,7 @@ auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config&
                                                 UniversalGemmProblem::TransposeC,
                                                 memory_operation>>;

-            using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            using Kernel = ck_tile::GemmKernelMultiD<TilePartitioner, GemmPipeline, GemmEpilogue>;
            auto kargs   = Kernel::MakeKernelArgs(args);

            const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
@@ -170,10 +170,9 @@ auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config&

            if(s.log_level_ > 0)
            {
-                std::cout << "Launching kernel with args:"
-                          << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
-                          << "}" << std::endl;
+                std::cout << "Launching kernel with args:" << " grid: {" << grids.x << ", "
+                          << grids.y << ", " << grids.z << "}" << ", blocks: {" << blocks.x << ", "
+                          << blocks.y << ", " << blocks.z << "}" << std::endl;
            }

            ave_time = ck_tile::launch_kernel(
--- a/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.hpp
+++ b/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.hpp
@@ -64,7 +64,7 @@ auto create_args(int argc, char* argv[])
    return std::make_tuple(result, arg_parser);
 }

-using gemm_multi_d_kargs = ck_tile::GemmHostArgs<DsDataType::size()>;
+using gemm_multi_d_kargs = ck_tile::GemmMultiDHostArgs<DsDataType::size()>;

 template <typename ADataType,
          typename BDataType,
--- a/example/ck_tile/20_grouped_convolution/CMakeLists.txt
+++ b/example/ck_tile/20_grouped_convolution/CMakeLists.txt
@@ -1,4 +1,8 @@
-add_executable(tile_example_grouped_conv_fwd EXCLUDE_FROM_ALL grouped_convolution_forward.cpp)
 set(EXAMPLE_CONV_COMPILE_OPTIONS)
 list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
+
+add_executable(tile_example_grouped_conv_fwd EXCLUDE_FROM_ALL grouped_convolution_forward.cpp)
 target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+
+add_executable(tile_example_grouped_conv_bwd_weight EXCLUDE_FROM_ALL grouped_convolution_backward_weight.cpp)
+target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "grouped_convolution_utils.hpp"
+
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename AccDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename DsDataType     = ck_tile::tuple<>,
+          typename DsLayout       = ck_tile::tuple<>,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args,
+                              const ck_tile::stream_config& s)
+{
+    constexpr int kBlockPerCu = 1;
+
+    constexpr ck_tile::index_t M_Tile = 64;
+    constexpr ck_tile::index_t N_Tile = 64;
+    constexpr ck_tile::index_t K_Tile = 64;
+
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    constexpr ck_tile::index_t VectorSizeA = 8;
+    constexpr ck_tile::index_t VectorSizeB = 8;
+    constexpr ck_tile::index_t VectorSizeC = 8;
+
+    // Implicit GEMM Traits
+    using CodegenShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+    constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
+    using TilePartitioner   = ck_tile::GemmTile1DPartitioner<CodegenShape>;
+    using GroupedConvTraitsType =
+        ck_tile::GroupedConvTraits<NDimSpatial, ConvSpec, InLayout, WeiLayout, DsLayout, OutLayout>;
+    using CodegenPipelineProblem =
+        ck_tile::GemmPipelineProblem<InDataType,
+                                     WeiDataType,
+                                     AccDataType,
+                                     CodegenShape,
+                                     typename GroupedConvTraitsType::GroupedConvImplicitGemmTraits,
+                                     InDataType,
+                                     true,
+                                     VectorSizeA,
+                                     VectorSizeB>;
+    using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+
+    const auto Run = [&](const auto memory_operation_) {
+        constexpr auto memory_operation = memory_operation_.value;
+
+        using ConvEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<InDataType,
+                                             WeiDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             OutDataType,
+                                             typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                                             ck_tile::tensor_layout::gemm::RowMajor,
+                                             CDEElementWise,
+                                             CodegenPipelineProblem::kBlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             M_Warp,
+                                             N_Warp,
+                                             M_Warp_Tile,
+                                             N_Warp_Tile,
+                                             K_Warp_Tile,
+                                             CodegenPipelineProblem::TransposeC,
+                                             memory_operation,
+                                             1,
+                                             true,
+                                             VectorSizeC>>;
+
+        using Kernel = ck_tile::GroupedConvolutionBackwardWeightKernel<GroupedConvTraitsType,
+                                                                       TilePartitioner,
+                                                                       CodegenPipeline,
+                                                                       ConvEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);
+
+        const dim3 grids      = Kernel::GridSize(kargs);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << CodegenShape::GetName() << '\n'
+                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                      << "pipeline: " << CodegenPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << '\n'
+                      << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
+                      << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
+                      << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+        }
+
+        float ave_time = ck_tile::launch_kernel_time_mask(
+            s,
+            Kernel::Preprocess(kargs, s),
+            ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+        return ave_time;
+    };
+
+    if(args.k_batch == 1)
+    {
+        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                              ck_tile::memory_operation_enum::set>{});
+    }
+    else
+    {
+        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                              ck_tile::memory_operation_enum::atomic_add>{});
+    }
+}
+
+#include "run_grouped_convolution_bwd_weight_example.inc"
+
+template <typename InPrecType, typename WeiPrecType = InPrecType, typename OutPrecType = InPrecType>
+int run_grouped_conv_bwd_weight_example_prec_type(
+    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
+{
+    using NWGC   = ck_tile::tensor_layout::convolution::NWGC;
+    using NHWGC  = ck_tile::tensor_layout::convolution::NHWGC;
+    using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
+
+    using GKXC   = ck_tile::tensor_layout::convolution::GKXC;
+    using GKYXC  = ck_tile::tensor_layout::convolution::GKYXC;
+    using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
+
+    using NWGK   = ck_tile::tensor_layout::convolution::NWGK;
+    using NHWGK  = ck_tile::tensor_layout::convolution::NHWGK;
+    using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
+
+    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
+    {
+        return run_grouped_conv_bwd_weight_example_with_layouts<ck_tile::number<1>{},
+                                                                InPrecType,
+                                                                WeiPrecType,
+                                                                OutPrecType>(
+            argc, argv, NWGC{}, GKXC{}, NWGK{});
+    }
+    else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
+    {
+        return run_grouped_conv_bwd_weight_example_with_layouts<ck_tile::number<2>{},
+                                                                InPrecType,
+                                                                WeiPrecType,
+                                                                OutPrecType>(
+            argc, argv, NHWGC{}, GKYXC{}, NHWGK{});
+    }
+    else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK")
+    {
+        return run_grouped_conv_bwd_weight_example_with_layouts<ck_tile::number<3>{},
+                                                                InPrecType,
+                                                                WeiPrecType,
+                                                                OutPrecType>(
+            argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported memory layout!");
+    }
+}
+
+int run_grouped_conv_bwd_weight_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    std::string data_type  = arg_parser.get_str("prec");
+    std::string in_layout  = arg_parser.get_str("in_layout");
+    std::string wei_layout = arg_parser.get_str("wei_layout");
+    std::string out_layout = arg_parser.get_str("out_layout");
+
+    if(data_type == "fp16")
+    {
+        return run_grouped_conv_bwd_weight_example_prec_type<ck_tile::half_t>(
+            in_layout, wei_layout, out_layout, argc, argv);
+    }
+    else if(data_type == "bf16")
+    {
+        return run_grouped_conv_bwd_weight_example_prec_type<ck_tile::bf16_t>(
+            in_layout, wei_layout, out_layout, argc, argv);
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for this operation!");
+    }
+}
+
+int main(int argc, char* argv[]) { return !run_grouped_conv_bwd_weight_example(argc, argv); }
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
@@ -23,7 +23,7 @@ template <ck_tile::index_t NDimSpatial,
          typename DsDataType     = ck_tile::tuple<>,
          typename DsLayout       = ck_tile::tuple<>,
          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float grouped_conv_fwd(const ck_tile::GroupedConvHostArgs& args, const ck_tile::stream_config& s)
+float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, const ck_tile::stream_config& s)
 {
    constexpr int kBlockPerCu = 1;

@@ -97,7 +97,7 @@ float grouped_conv_fwd(const ck_tile::GroupedConvHostArgs& args, const ck_tile::
                                                                ConvEpilogue>;
        auto kargs   = Kernel::MakeKernelArgs(args);

-        const dim3 grids      = Kernel::GridSize(args);
+        const dim3 grids      = Kernel::GridSize(kargs);
        constexpr dim3 blocks = Kernel::BlockSize();

        if(!Kernel::IsSupportedArgument(kargs))
@@ -129,7 +129,7 @@ float grouped_conv_fwd(const ck_tile::GroupedConvHostArgs& args, const ck_tile::
                                          ck_tile::memory_operation_enum::set>{});
 }

-#include "run_grouped_convolution_example.inc"
+#include "run_grouped_convolution_fwd_example.inc"

 template <typename InPrecType, typename WeiPrecType = InPrecType, typename OutPrecType = InPrecType>
 int run_grouped_conv_fwd_example_prec_type(
@@ -185,7 +185,7 @@ int run_grouped_conv_fwd_example(int argc, char* argv[])

    std::string data_type  = arg_parser.get_str("prec");
    std::string in_layout  = arg_parser.get_str("in_layout");
-    std::string wei_layout = arg_parser.get_str("weight_layout");
+    std::string wei_layout = arg_parser.get_str("wei_layout");
    std::string out_layout = arg_parser.get_str("out_layout");

    if(data_type == "fp16")
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
@@ -12,6 +12,28 @@
 #include "ck_tile/ops/gemm.hpp"
 #include "ck_tile/ops/grouped_convolution.hpp"

+template <typename InDataType, typename WeiDataType, typename AccDataType, typename OutDataType>
+auto calculate_rtol_atol(const ck_tile::index_t GemmK,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeType =
+        std::conditional_t<sizeof(InDataType) < sizeof(WeiDataType), InDataType, WeiDataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, OutDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(GemmK, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, OutDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(GemmK, kbatch));
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<OutDataType, OutDataType, OutDataType>(kbatch);
+    const auto atol_split_k =
+        ck_tile::get_absolute_threshold<OutDataType, OutDataType, OutDataType>(
+            max_accumulated_value, kbatch);
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
 ck_tile::index_t fill_spatial_dimensions(std::vector<ck_tile::index_t>& filter_spatial_lengths,
                                         std::vector<ck_tile::index_t>& image_spatial_lengths,
                                         std::vector<ck_tile::index_t>& strides,
@@ -90,7 +112,7 @@ auto create_args(int argc, char* argv[])
        .insert("rpad_w", "0", "right pad for w dimension")

        .insert("in_layout", "NHWGC", "Input image layout - NHWGC by default")
-        .insert("weight_layout", "GKYXC", "Weight layout - GKYXC by default")
+        .insert("wei_layout", "GKYXC", "Weight layout - GKYXC by default")
        .insert("out_layout", "NHWGK", "Output image layout - NHWGK by default")
        .insert("v", "1", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
@@ -105,4 +127,5 @@ auto create_args(int argc, char* argv[])
 }

 // host API
-float grouped_conv_fwd(const ck_tile::GroupedConvHostArgs& args, const ck_tile::stream_config& s);
+float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args,
+                       const ck_tile::stream_config& s);
--- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_weight_example.inc
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_weight_example.inc
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename AccDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+float invoke_grouped_conv_bwd_weight(ck_tile::GroupedConvBwdWeightHostArgs& args,
+                                     int n_warmup,
+                                     int n_repeat)
+{
+    float ave_time = grouped_conv_bwd_weight<NDimSpatial,
+                                             InDataType,
+                                             WeiDataType,
+                                             AccDataType,
+                                             OutDataType,
+                                             InLayout,
+                                             WeiLayout,
+                                             OutLayout>(
+        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+
+    std::size_t flop     = args.GetFlops();
+    std::size_t num_byte = args.GetByte<InDataType, WeiDataType, OutDataType>();
+    float tflops         = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec     = num_byte / 1.E6 / ave_time;
+
+    std::cout << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
+    return ave_time;
+}
+
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType = InDataType,
+          typename OutDataType = InDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+int run_grouped_conv_bwd_weight_example_with_layouts(
+    int argc, char* argv[], const InLayout, const WeiLayout, const OutLayout)
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    using AccDataType = float;
+
+    std::vector<ck_tile::index_t> filter_spatial_lengths;
+    std::vector<ck_tile::index_t> image_spatial_lengths;
+    std::vector<ck_tile::index_t> strides;
+    std::vector<ck_tile::index_t> dilations;
+    std::vector<ck_tile::index_t> lpads;
+    std::vector<ck_tile::index_t> rpads;
+
+    const ck_tile::index_t num_dim_sp = fill_spatial_dimensions(filter_spatial_lengths,
+                                                                image_spatial_lengths,
+                                                                strides,
+                                                                dilations,
+                                                                lpads,
+                                                                rpads,
+                                                                arg_parser);
+
+    ck_tile::conv::ConvParam conv_param{num_dim_sp,
+                                        arg_parser.get_int("g"),
+                                        arg_parser.get_int("n"),
+                                        arg_parser.get_int("k"),
+                                        arg_parser.get_int("c"),
+                                        filter_spatial_lengths,
+                                        image_spatial_lengths,
+                                        strides,
+                                        dilations,
+                                        lpads,
+                                        rpads};
+
+    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
+
+    const auto in_g_n_c_wis_desc =
+        ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+    const auto wei_g_k_c_xs_desc =
+        ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+    const auto out_g_n_k_wos_desc =
+        ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
+    ck_tile::HostTensor<InDataType> input(in_g_n_c_wis_desc);
+    ck_tile::HostTensor<WeiDataType> weight(wei_g_k_c_xs_desc);
+    ck_tile::HostTensor<OutDataType> output(out_g_n_k_wos_desc);
+
+    if(init_method == 0)
+    {
+        ck_tile::FillUniformDistribution<InDataType>{-1.f, 1.f}(input);
+        ck_tile::FillUniformDistribution<OutDataType>{-1.f, 1.f}(output);
+    }
+    else if(init_method == 1)
+    {
+        ck_tile::FillMonotonicSeq<InDataType>{}(input);
+        ck_tile::FillMonotonicSeq<OutDataType>{}(output);
+    }
+    else if(init_method == 2)
+    {
+        ck_tile::FillUniformDistribution<InDataType>{1.f, 1.f}(input);
+        ck_tile::FillUniformDistribution<OutDataType>{1.f, 1.f}(output);
+    }
+    else
+    {
+        input.SetZero();
+        output.SetZero();
+    }
+
+    ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
+
+    input_dev_buf.ToDevice(input.data());
+    weight_dev_buf.SetZero();
+    output_dev_buf.ToDevice(output.data());
+
+    ck_tile::GroupedConvBwdWeightHostArgs args(conv_param,
+                                               input_dev_buf.GetDeviceBuffer(),
+                                               weight_dev_buf.GetDeviceBuffer(),
+                                               {},
+                                               output_dev_buf.GetDeviceBuffer(),
+                                               kbatch);
+
+    std::cout << "Run Grouped Conv Fwd kernel" << std::endl;
+    std::cout << "input: " << input.mDesc << std::endl;
+    std::cout << "weight: " << weight.mDesc << std::endl;
+    std::cout << "output: " << output.mDesc << std::endl;
+
+    invoke_grouped_conv_bwd_weight<NDimSpatial,
+                                   InDataType,
+                                   WeiDataType,
+                                   AccDataType,
+                                   OutDataType,
+                                   InLayout,
+                                   WeiLayout,
+                                   OutLayout>(args, n_warmup, n_repeat);
+
+    weight_dev_buf.FromDevice(weight.data());
+    bool pass = true;
+
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<WeiDataType> weight_host_ref(wei_g_k_c_xs_desc);
+        weight_host_ref.SetZero();
+
+        ck_tile::
+            reference_grouped_conv_bwd_weight<NDimSpatial, InDataType, WeiDataType, OutDataType>(
+                input,
+                weight_host_ref,
+                output,
+                conv_param.conv_filter_strides_,
+                conv_param.conv_filter_dilations_,
+                conv_param.input_left_pads_,
+                conv_param.input_right_pads_);
+        const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_);
+        const float max_accumulated_value =
+            *std::max_element(weight_host_ref.mData.begin(), weight_host_ref.mData.end());
+        const auto rtol_atol =
+            calculate_rtol_atol<InDataType, WeiDataType, AccDataType, OutDataType>(
+                GemmK, kbatch, max_accumulated_value);
+        pass = ck_tile::check_err(weight,
+                                  weight_host_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        throw std::runtime_error("Unsupported gpu verification !!!");
+    }
+
+    return pass;
+}
--- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc
@@ -2,28 +2,6 @@
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

-template <typename InDataType, typename WeiDataType, typename AccDataType, typename OutDataType>
-auto calculate_rtol_atol(const ck_tile::index_t GemmK,
-                         const ck_tile::index_t kbatch,
-                         const float max_accumulated_value)
-{
-    using ComputeType =
-        std::conditional_t<sizeof(InDataType) < sizeof(WeiDataType), InDataType, WeiDataType>;
-    // Calculate thresholds
-    const auto rtol = ck_tile::get_relative_threshold<ComputeType, OutDataType, AccDataType>(
-        ck_tile::integer_divide_ceil(GemmK, kbatch));
-    const auto atol = ck_tile::get_absolute_threshold<ComputeType, OutDataType, AccDataType>(
-        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(GemmK, kbatch));
-    // Calculate error due to split_k accumulation
-    const auto rtol_split_k =
-        ck_tile::get_relative_threshold<OutDataType, OutDataType, OutDataType>(kbatch);
-    const auto atol_split_k =
-        ck_tile::get_absolute_threshold<OutDataType, OutDataType, OutDataType>(
-            max_accumulated_value, kbatch);
-    // Use higher threshold
-    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
-}
-
 template <ck_tile::index_t NDimSpatial,
          typename InDataType,
          typename WeiDataType,
@@ -32,7 +10,9 @@ template <ck_tile::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename OutLayout>
-float invoke_grouped_conv_fwd(ck_tile::GroupedConvHostArgs& args, int n_warmup, int n_repeat)
+float invoke_grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args,
+                              int n_warmup,
+                              int n_repeat)
 {
    float ave_time = grouped_conv_fwd<NDimSpatial,
                                      InDataType,
@@ -143,12 +123,12 @@ int run_grouped_conv_fwd_example_with_layouts(
    weight_dev_buf.ToDevice(weight.data());
    output_dev_buf.SetZero();

-    ck_tile::GroupedConvHostArgs args(conv_param,
-                                      input_dev_buf.GetDeviceBuffer(),
-                                      weight_dev_buf.GetDeviceBuffer(),
-                                      {},
-                                      output_dev_buf.GetDeviceBuffer(),
-                                      kbatch);
+    ck_tile::GroupedConvFwdHostArgs args(conv_param,
+                                         input_dev_buf.GetDeviceBuffer(),
+                                         weight_dev_buf.GetDeviceBuffer(),
+                                         {},
+                                         output_dev_buf.GetDeviceBuffer(),
+                                         kbatch);

    std::cout << "Run Grouped Conv Fwd kernel" << std::endl;
    std::cout << "input: " << input.mDesc << std::endl;
--- a/example/ck_tile/21_elementwise/CMakeLists.txt
+++ b/example/ck_tile/21_elementwise/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Elementwise example targets 2D inputs
+set(TARGET_NAME_2D_INPUT tile_example_elementwise)
+add_executable(${TARGET_NAME_2D_INPUT} elementwise_example.cpp)
+
+# Elementwise unary example targets 2D inputs
+set(TARGET_NAME_2D_INPUT_UNARY tile_example_elementwise_unary)
+add_executable(${TARGET_NAME_2D_INPUT_UNARY} elementwise_example_unary.cpp)
+
+# Elementwise transpose example targets 2D inputs
+set(TARGET_NAME_2D_INPUT_TRANSPOSE tile_example_elementwise_transpose)
+add_executable(${TARGET_NAME_2D_INPUT_TRANSPOSE} elementwise_example_transpose.cpp)
+
+# Elementwise example targets 4D inputs
+set(TARGET_NAME_4D_INPUT tile_example_elementwise_add_4d)
+add_executable(${TARGET_NAME_4D_INPUT} elementwise_example_add_4d.cpp)
--- a/example/ck_tile/21_elementwise/elementwise_example.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example.cpp
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/elementwise.hpp"
+#include "ck_tile/host/reference/reference_elementwise.hpp"
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "1024", "m dimension")
+        .insert("n", "1024", "n dimension")
+        .insert("stride", "-1", "stride per row, if -1 then equal to n")
+        .insert("v", "1", "cpu validation or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "10", "cold iter")
+        .insert("repeat", "50", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t M      = arg_parser.get_int("m");
+    ck_tile::index_t N      = arg_parser.get_int("n");
+    ck_tile::index_t stride = arg_parser.get_int("stride");
+
+    // If stride is negative (default -1), set it to N, assuming a dense row-major layout.
+    if(stride < 0)
+        stride = N;
+    std::string data_type = arg_parser.get_str("prec");
+    int do_validation     = arg_parser.get_int("v");
+    int warmup            = arg_parser.get_int("warmup");
+    int repeat            = arg_parser.get_int("repeat");
+
+    if(stride < N)
+    {
+        throw std::runtime_error("stride must be >= N");
+    }
+
+    // Define type aliases for clarity.
+    // XDataType: Data type of the input tensors.
+    // ComputeDataType: Data type used for intermediate computations (often float for precision).
+    // YDataType: Data type of the output tensor.
+    // XElementwiseOperation: The specific elementwise operation to perform (e.g., Add, Mul).
+    using XDataType = DataType;
+    using ComputeDataType =
+        float; // Using float for intermediate calculations can improve numerical stability.
+    using YDataType             = DataType;
+    using XElementwiseOperation = ck_tile::element_wise::Add;
+
+    // 1. Initialize the input data on the host (CPU).
+    // HostTensor is a utility to manage tensor data on the CPU.
+    // The first argument is the shape (dimensions) of the tensor {M, N}.
+    // The second argument is the strides {stride, 1} for row-major layout.
+    // 'x_host_a' and 'x_host_b' are the two input tensors for the elementwise operation.
+    ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride, 1});
+    ck_tile::HostTensor<XDataType> x_host_b({M, N}, {stride, 1});
+    ck_tile::HostTensor<YDataType> y_host({M, N}, {stride, 1});
+    ck_tile::HostTensor<YDataType> y_validation({M, N}, {stride, 1});
+
+    std::vector<ck_tile::index_t> shape = {M, N};
+
+    // Fill the host tensors with random data.
+    // FillUniformDistribution populates the tensor with values from a uniform distribution,
+    // within an interval.
+    ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
+    ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_b);
+
+    // 2. Create device memory buffers
+    // DeviceMem allocates memory on the GPU.
+    // The size is determined by the total number of elements and the size of DataType.
+    ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_buf_b(x_host_b.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes());
+
+    // Copy data from host input tensors to device buffers.
+    x_buf_a.ToDevice(x_host_a.data());
+    x_buf_b.ToDevice(x_host_b.data());
+
+    // 3. Configure the kernel execution parameters.
+    // Dividing the problem into blocktile, blockwarp and warptile
+    // The blocktile is the size of the tile processed by a single work group (also called thread
+    // block). The warptile is the size of the tile processed by a single wavefront (also called
+    // warp). The vector is the size of the tile processed by a single work item (also called
+    // thread). The problem is divided into blocks of size BlockTile. Each block is further divided
+    // into wavefronts of size WarpTile. Each wavefront is composed of 64 work items (on AMD; 32
+    // threads on NVIDIA). Each work item in a wavefront processes one vector's worth of elements.
+    // Note that WarpTile/Vector should be 64 for CDNA (because there are 64 work items per
+    // wavefront). Vector size is set to be 16 / sizeof(ComputeDataType), to maximize vectorization.
+    using BlockTile = ck_tile::sequence<2048>; // How many elements are handled by a block tile (the
+                                               // tensor is divided into blocks of this size)
+    using BlockWarps = ck_tile::sequence<8>; // How many concurrent wavefronts are in a block (each
+                                             // wavefront will cover some part of the block tile)
+
+    // WarpTile: Defines the size of the data sub-tile processed by a single wavefront.
+    // This should be consistent with BlockTile and BlockWarps.
+    // If BlockTile is 2048 and BlockWarps is 8, then WarpTile could be 2048/8 = 256.
+    // However, this example uses 64, meaning each wavefront processes 64 elements, and multiple
+    // such wavefront operations might be needed to cover the BlockTile, or the BlockTile is
+    // distributed differently.
+    // The current configuration (BlockTile=2048, BlockWarps=8, WarpTile=64) implies that
+    // each wavefront processes 64 elements, and 8 wavefronts process 8*64 = 512 elements
+    // concurrently. Since 512 is not equal to 2048, it means that warptile(s) will need to iterate
+    // over multiple times over different set of elements to cover the entire BlockTile.
+    using WarpTile = ck_tile::sequence<64>;
+
+    // 4. Create the kernel
+
+    // ElementWiseShape bundles these tiling parameters.
+    // It calculates derived properties like threads per wavefront, repeats, vectorization and total
+    // block size.
+    using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, ComputeDataType>;
+
+    // ElementWisePipelineProblem encapsulates all necessary information for the elementwise kernel:
+    // - Data types (input, compute, output).
+    // - Shape traits (tiling configuration).
+    // - The specific elementwise operation (e.g., Add).
+    using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
+                                                        ComputeDataType,
+                                                        YDataType,
+                                                        Shape,
+                                                        XElementwiseOperation>;
+
+    // ElementWiseKernel refers to the GPU kernel class
+    using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
+
+    // Compute flattened size
+    ck_tile::index_t total_elements = 1;
+    for(auto d : shape)
+        total_elements *= d;
+
+    // kBlockSize: The number of work items in a GPU workgroup (thread block).
+    // This is often a multiple of the wavefront size, 64 on CDNA.
+    // Here, it's explicitly set to 512. This should be consistent with Shape::kBlockSize.
+    // Shape::kBlockSize would be BlockWarps * warpSize (e.g., 8 * 64 = 512).
+    constexpr ck_tile::index_t kBlockSize =
+        ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
+
+    // kBlockPerCu: Hint for how many workgroups can be scheduled per Compute Unit (CU).
+    // This can influence occupancy and performance.
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    // kGridSize: Calculates the total number of workgroups required to process all elements.
+    // Each workgroup is responsible for 'elements_per_block' elements.
+    // To ensure all elements are covered, especially when 'total_elements' is not perfectly
+    // divisible by 'elements_per_block', using ceiling division.
+    constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
+    ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
+
+    std::cout << "grid size = " << kGridSize << std::endl;
+    std::cout << "Total elements = " << total_elements << std::endl;
+
+    auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()),
+                                             static_cast<XDataType*>(x_buf_b.GetDeviceBuffer()));
+
+    auto input_size = ck_tile::make_tuple(M, N);
+
+    // Check if the kernel configuration is supported
+    if(!Kernel::IsSupportedArgument(input_size))
+    {
+        throw std::runtime_error(
+            "The kernel configuration is not supported for the given input size.");
+    }
+
+    // 4. Run the kernel
+    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+                                       Kernel{},
+                                       kGridSize,
+                                       kBlockSize,
+                                       0,
+                                       input_size,
+                                       ck_tile::make_tuple(N, 1), // Input Stride
+                                       ck_tile::make_tuple(N, 1), // Output Stride
+                                       input_tensors,
+                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
+
+    std::cout << "Average time: " << ave_time << " ms" << std::endl;
+
+    // 5. Verify the output
+    bool pass = true;
+    if(do_validation)
+    {
+        y_buf.FromDevice(y_validation.data());
+        auto op = [](const auto& v0, const auto& v1) { return v0 + v1; };
+
+        ck_tile::reference_binary_elementwise<XDataType, XDataType, YDataType, ComputeDataType>(
+            x_host_a, x_host_b, y_host, op);
+
+        pass = ck_tile::check_err(
+            y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+
+    return -3;
+}
--- a/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/elementwise.hpp"
+#include "ck_tile/host/reference/reference_elementwise.hpp"
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("dim0", "4", "dimension 0")
+        .insert("dim1", "16", "dimension 1")
+        .insert("dim2", "32", "dimension 2")
+        .insert("dim3", "32", "dimension 3")
+        .insert("v", "1", "cpu validation or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "10", "cold iter")
+        .insert("repeat", "50", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t D0 = arg_parser.get_int("dim0");
+    ck_tile::index_t D1 = arg_parser.get_int("dim1");
+    ck_tile::index_t D2 = arg_parser.get_int("dim2");
+    ck_tile::index_t D3 = arg_parser.get_int("dim3");
+
+    std::string data_type = arg_parser.get_str("prec");
+    int do_validation     = arg_parser.get_int("v");
+    int warmup            = arg_parser.get_int("warmup");
+    int repeat            = arg_parser.get_int("repeat");
+
+    using XDataType = DataType;
+    using ComputeDataType =
+        float; // Using float for intermediate calculations can improve numerical stability.
+    using YDataType             = DataType;
+    using XElementwiseOperation = ck_tile::element_wise::Add;
+
+    // Initialize the input data on the host (CPU).
+    std::vector<ck_tile::index_t> problem_shape = {D0, D1, D2, D3};
+
+    std::vector<ck_tile::index_t> host_strides(4);
+    host_strides[3] = 1;
+    host_strides[2] = problem_shape[3];
+    host_strides[1] = problem_shape[2] * problem_shape[3];
+    host_strides[0] = problem_shape[1] * problem_shape[2] * problem_shape[3];
+
+    ck_tile::HostTensor<XDataType> x_host_a(problem_shape, host_strides);
+    ck_tile::HostTensor<XDataType> x_host_b(problem_shape, host_strides);
+    ck_tile::HostTensor<YDataType> y_host(problem_shape, host_strides);
+    ck_tile::HostTensor<YDataType> y_validation(problem_shape, host_strides);
+
+    ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
+    ck_tile::FillUniformDistribution<XDataType>{2.f, 10.f}(x_host_b);
+
+    ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_buf_b(x_host_b.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes());
+
+    x_buf_a.ToDevice(x_host_a.data());
+    x_buf_b.ToDevice(x_host_b.data());
+
+    using BlockTile  = ck_tile::sequence<256>;
+    using BlockWarps = ck_tile::sequence<1>;
+    using WarpTile   = ck_tile::sequence<256>;
+
+    using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, ComputeDataType>;
+
+    using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
+                                                        ComputeDataType,
+                                                        YDataType,
+                                                        Shape,
+                                                        XElementwiseOperation>;
+
+    using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
+
+    ck_tile::index_t total_elements = 1;
+    for(auto d : problem_shape)
+        total_elements *= d;
+
+    constexpr ck_tile::index_t kBlockSize =
+        ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
+
+    constexpr ck_tile::index_t kBlockPerCu = 2;
+
+    constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
+    ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
+
+    std::cout << "grid size = " << kGridSize << std::endl;
+    std::cout << "Total elements = " << total_elements << std::endl;
+
+    auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()),
+                                             static_cast<XDataType*>(x_buf_b.GetDeviceBuffer()));
+
+    auto problem_shape_tuple =
+        ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
+
+    auto strides_tuple =
+        ck_tile::make_tuple(host_strides[0], host_strides[1], host_strides[2], host_strides[3]);
+
+    // Check if the kernel configuration is supported
+    if(!Kernel::IsSupportedArgument(problem_shape_tuple))
+    {
+        throw std::runtime_error(
+            "The kernel configuration is not supported for the given input size.");
+    }
+
+    // Run the kernel
+    float ave_time = launch_kernel(
+        ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+        ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+            Kernel{},
+            kGridSize,
+            kBlockSize,
+            0,
+            problem_shape_tuple, // ck_tile::tuple<index_t, index_t, index_t, index_t>
+            strides_tuple, // ck_tile::tuple<index_t, index_t, index_t, index_t> for input strides
+            strides_tuple, // ck_tile::tuple<index_t, index_t, index_t, index_t> for output strides
+            input_tensors,
+            static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
+
+    std::cout << "Average time: " << ave_time << " ms" << std::endl;
+
+    // Verify the output
+    bool pass = true;
+    if(do_validation)
+    {
+        y_buf.FromDevice(y_validation.data());
+        auto op = [](const auto& v0, const auto& v1) { return v0 + v1; };
+
+        ck_tile::reference_binary_elementwise<XDataType, XDataType, YDataType, ComputeDataType>(
+            x_host_a, x_host_b, y_host, op);
+
+        pass = ck_tile::check_err(
+            y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+
+    return -3;
+}
--- a/Show More
+++ b/Show More