Merge remote-tracking branch 'origin/develop' into samremes/bmatrix_2d_blockscale

2026-06-30 11:47:48 +00:00 · 2025-10-27 15:17:14 +00:00
parent 2d86cd0081 06973b1cf4
commit 470d6e4df4
507 changed files with 54586 additions and 14123 deletions
--- a/.github/scripts/therock_configure_ci.py
+++ b/.github/scripts/therock_configure_ci.py
@@ -6,6 +6,7 @@ import subprocess
 import sys
 from typing import Iterable, Optional, Mapping

+
 def gha_set_output(vars: Mapping[str, str | Path]):
    """Sets values in a step's output parameters.

@@ -25,6 +26,7 @@ def gha_set_output(vars: Mapping[str, str | Path]):
    with open(step_output_file, "a") as f:
        f.writelines(f"{k}={str(v)}" + "\n" for k, v in vars.items())

+
 def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
    """Returns the paths of modified files relative to the base reference."""
    try:
@@ -42,11 +44,13 @@ def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
            file=sys.stderr,
        )
        return None
-    
+
+
 GITHUB_WORKFLOWS_CI_PATTERNS = [
    "therock*",
 ]

+
 def is_path_workflow_file_related_to_ci(path: str) -> bool:
    return any(
        fnmatch.fnmatch(path, ".github/workflows/" + pattern)
@@ -56,11 +60,13 @@ def is_path_workflow_file_related_to_ci(path: str) -> bool:
        for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
    )

+
 def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool:
    if paths is None:
        return False
    return any(is_path_workflow_file_related_to_ci(p) for p in paths)

+
 # Paths matching any of these patterns are considered to have no influence over
 # build or test workflows so any related jobs can be skipped if all paths
 # modified by a commit/PR match a pattern in this list.
@@ -70,23 +76,26 @@ SKIPPABLE_PATH_PATTERNS = [
    "*.md",
    "*.pre-commit-config.*",
    "*LICENSE",
-    'Jenkinsfile',
-    '.github/ISSUE_TEMPLATE/*',
-    '.github/CODEOWNERS',
-    '.github/*.md',
-    '.github/dependabot.yml',
+    "Jenkinsfile",
+    ".github/ISSUE_TEMPLATE/*",
+    ".github/CODEOWNERS",
+    ".github/*.md",
+    ".github/dependabot.yml",
 ]

+
 def is_path_skippable(path: str) -> bool:
    """Determines if a given relative path to a file matches any skippable patterns."""
    return any(fnmatch.fnmatch(path, pattern) for pattern in SKIPPABLE_PATH_PATTERNS)

+
 def check_for_non_skippable_path(paths: Optional[Iterable[str]]) -> bool:
    """Returns true if at least one path is not in the skippable set."""
    if paths is None:
        return False
    return any(not is_path_skippable(p) for p in paths)

+
 def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
    """Returns true if CI workflows should run given a list of modified paths."""

@@ -118,16 +127,16 @@ def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
        )
        return False

+
 def main(args):
    base_ref = args.get("base_ref")
    modified_paths = get_modified_paths(base_ref)
    print("modified_paths (max 200):", modified_paths[:200])
    enable_jobs = should_ci_run_given_modified_paths(modified_paths)
-    output = {
-        'enable_therock_ci': json.dumps(enable_jobs)
-    }
+    output = {"enable_therock_ci": json.dumps(enable_jobs)}
    gha_set_output(output)

+
 if __name__ == "__main__":
    args = {}
    args["base_ref"] = os.environ.get("BASE_REF", "HEAD^1")
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,16 @@
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [develop]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - uses: actions/setup-python@v3
+      with:
+        python-version: '3.12'
+    - uses: pre-commit/action@v3.0.1
--- a/.github/workflows/therock-ci-linux.yml
+++ b/.github/workflows/therock-ci-linux.yml
@@ -20,7 +20,7 @@ jobs:
    permissions:
      id-token: write
    container:
-      image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:044b113562629f4bd2ec5d2e64b32eee11562d48fb1a75d7493daec9dd8d8292
+      image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:2f3ebd0beb04c449fdb36933e54bdc69483b914fb9005594d3fc9444c206b54b
      options: -v /runner/config:/home/awsconfig/
    env:
      AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
@@ -35,6 +35,15 @@ jobs:
        with:
          repository: "ROCm/rocm-libraries"

+      - name: Pull DVC files for rocm-libraries # LOGNAME details here https://github.com/ROCm/rocm-libraries/pull/1617
+        run: |
+          if command -v dvc &> /dev/null; then
+            echo "dvc detected"
+          else
+            echo "Warning, dvc not detected!"
+          fi
+          LOGNAME=github-runner dvc pull -v
+
      - name: Checkout composable_kernel repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
@@ -44,7 +53,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: "ROCm/TheRock"
-          ref: dc05d637054ad197c84b00e24b6262af0ec797c6 # 10-03-2025 commit
+          ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit
          path: "TheRock"

      - name: Setup ccache
--- a/.github/workflows/therock-test-component.yml
+++ b/.github/workflows/therock-test-component.yml
@@ -29,7 +29,7 @@ jobs:
        --group-add video
        --device /dev/kfd
        --device /dev/dri
-        --group-add 992
+        --group-add 110
        --env-file /etc/podinfo/gha-gpu-isolation-settings
    strategy:
      fail-fast: false
@@ -51,6 +51,7 @@ jobs:
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          repository: "ROCm/TheRock"
+          ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit

      - name: Run setup test environment workflow
        uses: './.github/actions/setup_test_environment'
--- a/.github/workflows/therock-test-packages.yml
+++ b/.github/workflows/therock-test-packages.yml
@@ -27,6 +27,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: "ROCm/TheRock"
+          ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit

      - name: "Configuring CI options"
        env:
--- a/.gitignore
+++ b/.gitignore
@@ -36,7 +36,7 @@ tags
 # Editors
 .vscode

-# build-in-source directory
+# build-in-source directory (see exceptions below)
 build*

 # emacs temporary/backup files
@@ -58,7 +58,7 @@ _doxygen/
 docs/doxygen/html
 docs/doxygen/xml

-# JetBrains IDE
+# JetBrains IDE (see build* exceptions below)
 .idea/
 cmake-build*/
 build*/
@@ -71,3 +71,7 @@ __pycache__/

 .cache/

+# Exceptions to build* patterns above
+# The experimental/builder directory should be tracked despite matching build*
+!experimental/builder
+!experimental/builder/**
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,11 +1,25 @@
 repos:
-   repo: local
+-   repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: v18.1.3
    hooks:
    -   id: clang-format
-        name: clang-format
-        entry: clang-format-18 -i --style=file
-        language: system
        types_or: [c++, inc]
+-   repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.14.0
+    hooks:
+    -   id: ruff-check
+        args: [ --fix ]
+        exclude: |
+            (?x)^(
+                docs/conf.py
+            )$
+    -   id: ruff-format
+        exclude: |
+            (?x)^(
+                docs/conf.py
+            )$
+-   repo: local
+    hooks:
    # -   id: copyright-year-checker
    #     name: copyright-year-checker
    #     entry: script/check_copyright_year.sh
@@ -18,21 +32,12 @@ repos:
        language: script
        types_or: [c++, text]
        verbose: true
-    -   id: ruff-check
-        name: Ruff Linter
-        entry: ruff check --fix
+    -   id: remod-ck-tile
+        name: Run ck_tile remod.py
+        entry: python script/remod_for_ck_tile.py
        language: python
-        types: [python]
-        additional_dependencies: [ruff]
-    -   id: ruff-format
-        name: Ruff Formatter
-        entry: ruff format
-        language: python
-        types: [python]
-        additional_dependencies: [ruff]
-    -   id: run-remod-if-ck-tile-changed
-        name: Run remod.py if ck_tile files changed
-        entry: script/remod_for_ck_tile.sh
-        language: script
-        always_run: true
+        files: '^(include|example)/ck_tile/.*$'
+        additional_dependencies:
+        - dos2unix
+        - clang-format==18.1.3
        pass_filenames: false
--- a/ACRONYMS.md
+++ b/ACRONYMS.md
@@ -0,0 +1,67 @@
+# Acronyms in Composable Kernel
+
+The following acronyms are used in the Composable Kernel codebase:
+
+| Acronym | Expansion | Explanation |
+|---------|-----------|-------------|
+| BF16    | Brain Floating Point 16 | 1 Sign bit, 8 Exponent bits, 7 Significand bits |
+| BF8     | 8-bit Brain Floating Point | 1 Sign bit, 5 Exponent bits, 2 Significand bits (E5M2) |
+| DLA     | Deep Learning Accelerator | Specialized hardware for deep learning workloads |
+| DRAM    | Dynamic Random-Access Memory | Main memory. Global memory on GPU |
+| E2E     | End-to-End | Complete pipeline or process from input to output |
+| ELU     | Exponential Linear Unit | Activation function: $x$ if $x>0$ else $\alpha(e^x-1)$ |
+| FMHA    | Fused Multi-Head Attention | Efficient transformer attention kernel, fusing softmax, masking, and matmul |
+| FP16    | Half-Precision Floating Point | 16-bit IEEE floating point format |
+| FP32    | Single-Precision Floating Point | 32-bit IEEE floating point format |
+| FP64    | Double-Precision Floating Point | 64-bit IEEE floating point format |
+| FP8     | 8-bit Floating Point | Experimental 8-bit floating point format for inference |
+| GEMM    | General Matrix Multiply | Matrix multiplication operation: $C = A \times B$ |
+| GELU    | Gaussian Error Linear Unit | Activation function: $x \cdot \Phi(x)$ |
+| GQA     | Grouped Query Attention | Variant of multi-head attention with grouped queries/keys/values |
+| HBM     | High Bandwidth Memory | Fast memory used in modern GPUs |
+| HIP     | Heterogeneous-Compute Interface for Portability | AMD's CUDA-like GPU programming API |
+| INT8    | 8-bit Integer | Quantized integer format for inference |
+| KVS     | Key-Value Store | Data structure for storing key-value pairs (context: QKV in transformers) |
+| L2/L1   | Level 2/Level 1 Cache | On-chip memory hierarchy in CPUs/GPUs |
+| LDS     | Local Data Share | Shared memory on AMD GPUs (equivalent to CUDA's shared memory) |
+| LLM     | Large Language Model | Transformer-based model for NLP tasks |
+| LSE     | Log-Sum-Exp | Numerically stable softmax computation: $\log(\sum \exp(x))$ |
+| MHA     | Multi-Head Attention | Attention mechanism with multiple heads in transformers |
+| MFMA    | Matrix Fused Multiply-Add | AMD GPU hardware instruction for matrix-matrix multiplication |
+| MoE     | Mixture of Experts | Neural network architecture with multiple expert subnetworks |
+| MQA     | Multi-Query Attention | Variant of multi-head attention with shared keys/values across heads |
+| RCCL    | ROCm Collective Communications Library | AMD Library for multi-GPU communication |
+| NCHW    | Batch, Channel, Height, Width | Tensor layout: batch-major, channels-first |
+| NHWC    | Batch, Height, Width, Channel | Tensor layout: batch-major, channels-last |
+| OOM     | Out Of Memory | Error when memory allocation fails |
+| QAT     | Quantization Aware Training | Training technique for quantized inference |
+| QKV     | Query, Key, Value | Components of transformer attention mechanism |
+| RDMA    | Remote Direct Memory Access | High-speed network memory access |
+| RDQuant | Rowwise Dynamic Quantization | Quantization technique with per-row scaling for int8 inference |
+| ReLU    | Rectified Linear Unit | Activation function: $\max(0, x)$ |
+| ROCm    | Radeon Open Compute | AMD's open GPU computing stack |
+| SGD     | Stochastic Gradient Descent | Optimization algorithm for training neural networks |
+| SM      | Streaming Multiprocessor | GPU compute unit (NVIDIA terminology) |
+| SWA     | Sliding Window Attention | Attention mechanism with a limited window for each token |
+| TLB     | Translation Lookaside Buffer | Memory management unit cache for virtual-to-physical address translation |
+| VGPR    | Vector General Purpose Register | GPU register for vector operations |
+| WARP    | Group of Threads | Smallest scheduling unit on NVIDIA GPUs (32 threads) |
+| WMMA    | Warp Matrix Multiply-Accumulate | NVIDIA's matrix-multiply hardware primitive |
+| XLA     | Accelerated Linear Algebra | Compiler for optimizing ML computations (Google) |
+
+### Common Variable Acronyms in Code
+
+| Symbol | Meaning | Context |
+|--------|---------|---------|
+| M, N, K | Matrix dimensions | GEMM: $A[M,K] \times B[K,N] = C[M,N]$ |
+| Q, K, V | Query, Key, Value | Transformer attention |
+| S       | Sequence length | NLP, transformers |
+| D       | Dimension | Hidden size, feature dim |
+| B       | Batch size | ML batch processing |
+| H       | Head count | Multi-head attention |
+| C       | Channel | CNNs, tensor layouts |
+| T       | Token | NLP, sequence models |
+
+---
+
+If you find an acronym not listed here, please submit a pull request or issue!
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,15 +2,44 @@

 Documentation for Composable Kernel available at [https://rocm.docs.amd.com/projects/composable_kernel/en/latest/](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/).

-## Composable Kernel 1.2.0 for ROCm 7.0.0
+## (Unreleased) Composable Kernel for ROCm
+
+### Added 

-### Added
 * Added a compute async pipeline in the CK TILE universal GEMM on gfx950
 * Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM.
 * Added the new api to load different memory sizes to SGPR.
 * Added support for B Tensor Preshuffle in CK TILE Grouped GEMM.
 * Added a basic copy kernel example and supporting documentation for new CK Tile developers.
 * Added support for grouped_gemm kernels to perform multi_d elementwise operation.
+* Added support for Multiple ABD GEMM
+* Added benchmarking support for tile engine GEMM Multi D.
+* Added block scaling support in CK_TILE GEMM, allowing flexible use of quantization matrices from either A or B operands.
+* Added row-wise and column-wise quantization for CK_TILE GEMM & CK_TILE Grouped GEMM.
+* Added support for f32 to FMHA (fwd/bwd).
+* Added tensor-wise quantization for CK_TILE GEMM.
+* Added support for batched contraction kernel.
+* Added pooling kernel in CK_TILE
+
+### Changed
+
+* Removed `BlockSize` in `make_kernel` and `CShuffleEpilogueProblem` to support Wave32 in CK_TILE (#2594)
+
+## Composable Kernel 1.1.0 for ROCm 7.1.0
+
+### Added
+
+* Added support for hdim as a multiple of 32 for FMHA (fwd/fwd_splitkv/bwd)
+* Added support for elementwise kernel.
+
+### Upcoming changes
+
+* Non-grouped convolutions are deprecated. Their functionality is supported by grouped convolution.
+
+## Composable Kernel 1.1.0 for ROCm 7.0.0
+
+### Added
+
 * Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data
 * Added a fully asynchronous HOST (CPU) arguments copy flow for CK grouped GEMM kernels.
 * Added support GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW, number of instances in instance factory for NGCHW/GKYXC/NGKHW has been reduced).
@@ -19,38 +48,23 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 * Added support for GKCYX layout for grouped convolution backward data (NGCHW/GKCYX/NGKHW).
 * Added support for Stream-K version of mixed fp8/bf16 GEMM
 * Added support for Multiple D GEMM
-* Added support for Multiple ABD GEMM
 * Added GEMM pipeline for microscaling (MX) FP8/FP6/FP4 data types
 * Added support for FP16 2:4 structured sparsity to universal GEMM.
 * Added support for Split K for grouped convolution backward data.
 * Added logit soft-capping support for fMHA forward kernels.
 * Added support for hdim as a multiple of 32 for FMHA (fwd/fwd_splitkv)
-* Added support for hdim as a multiple of 32 for FMHA (fwd/fwd_splitkv/bwd)
 * Added benchmarking support for tile engine GEMM.
 * Added Ping-pong scheduler support for GEMM operation along the K dimension.
 * Added rotating buffer feature for CK_Tile GEMM.
 * Added int8 support for CK_TILE GEMM.
-* Added support for elementwise kernel.
-* Added benchmarking support for tile engine GEMM Multi D.
-* Added block scaling support in CK_TILE GEMM, allowing flexible use of quantization matrices from either A or B operands.
-* Added the row-wise column-wise quantization for CK_TILE GEMM & CK_TILE Grouped GEMM.
-* Added support for f32 to FMHA (fwd/bwd).
-* Added tensor-wise quantization for CK_TILE GEMM.
-* Added pooling kernel in CK_TILE

 ### Optimized

+* Optimize the gemm multiply multiply preshuffle & lds bypass with Pack of KGroup and better instruction layout.
+* Added Vectorize Transpose optimization for CK Tile 
+* Added the asynchronous copy for gfx950

-* Optimize the gemm multiply multiply preshuffle & lds bypass with Pack of KGroup and better instruction layout. (#2166)
-* Added Vectorize Transpose optimization for CK Tile (#2131)
-* Added the asynchronous copy for gfx950 (#2425)
-
-
-### Fixes
-
-None
-
-### Changes
+### Changed

 * Removed support for gfx940 and gfx941 targets (#1944)
 * Replaced the raw buffer load/store intrinsics with Clang20 built-ins (#1876)
@@ -58,15 +72,6 @@ None
 * Number of instances in instance factory for grouped convolution forward NGCHW/GKYXC/NGKHW has been reduced.
 * Number of instances in instance factory for grouped convolution backward weight NGCHW/GKYXC/NGKHW has been reduced.
 * Number of instances in instance factory for grouped convolution backward data NGCHW/GKYXC/NGKHW has been reduced.
-* Removed `BlockSize` in `make_kernel` and `CShuffleEpilogueProblem` to support Wave32 in CK_TILE (#2594)
-
-### Known issues
-
-None
-
-### Upcoming changes
-
-* Non-grouped convolutions are deprecated. All of their functionality is supported by grouped convolution.

 ## Composable Kernel 1.1.0 for ROCm 6.1.0

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,8 +37,14 @@ include(CTest)

 option(ENABLE_CLANG_CPP_CHECKS "Enables clang tidy, cppcheck" ON)
 option(MIOPEN_REQ_LIBS_ONLY "Build only the MIOpen required libraries" OFF)
+option(CK_EXPERIMENTAL_BUILDER "Enable experimental builder" OFF)
 option(BUILD_MHA_LIB "Build the static library for flash attention" OFF)

+if(CK_EXPERIMENTAL_BUILDER)
+    add_definitions(-DCK_EXPERIMENTAL_BUILDER)
+    include_directories(${PROJECT_SOURCE_DIR}/experimental/builder/include)  
+endif()
+
 # Usage: for customized Python location cmake -DCK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8"
 # CK Codegen requires dataclass which is added in Python 3.7
 # Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04
@@ -692,6 +698,10 @@ if (NOT MIOPEN_REQ_LIBS_ONLY)
    add_subdirectory(profiler)
 endif()

+if (CK_EXPERIMENTAL_BUILDER)
+  add_subdirectory(experimental/builder)
+endif()
+
 if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
  add_subdirectory(codegen)
 endif()
--- a/59
+++ b/59
@@ -12,6 +12,14 @@ def show_node_info() {
    """
 }

+// Error patterns to scan build logs for specific failure types and send detailed notifications.
+def failurePatterns = [
+    [pattern: /login attempt to .* failed with status: 401 Unauthorized/, description: "Docker registry authentication failed"],
+    [pattern: /docker login failed/, description: "Docker login failed"],
+    [pattern: /HTTP request sent .* 404 Not Found/, description: "HTTP request failed with 404"],
+    [pattern: /cat: .* No such file or directory/, description: "GPU not found"],
+]
+
 class Version {
    int major, minor, patch
    @Override
@@ -71,7 +79,7 @@ def shouldRunCICheck() {
            '''
        ).trim().split('\n')
        
-        if (changedFiles.isEmpty() || (changedFiles.size() == 1 && changedFiles[0].trim().isEmpty())) {
+        if (changedFiles.size() == 1 && changedFiles[0] == '') {
            echo "No changed files detected - this might be a manual trigger or merge commit, running CI for safety"
            return true
        }
@@ -909,7 +917,7 @@ def run_aiter_tests(Map conf=[:]){
                sh "rocminfo"
                sh "python3 --version"
                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py"
-                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py"
+                //sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py" //temporarily disable
                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py"
                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha_varlen.py"
                sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe.py"
@@ -1039,8 +1047,8 @@ pipeline {
            description: "Use the CK build to verify hipTensor build and tests (default: OFF)")
        string(
            name: 'hipTensor_branch',
-            defaultValue: 'mainline',
-            description: 'Specify which branch of hipTensor to use (default: mainline)')
+            defaultValue: 'develop',
+            description: 'Specify which branch of hipTensor to use (default: develop)')
        booleanParam(
            name: "USE_SCCACHE",
            defaultValue: true,
@@ -1190,7 +1198,6 @@ pipeline {
            when {
                beforeAgent true
                expression { env.SHOULD_RUN_CI.toBoolean() }
-                expression { params.RUN_CPPCHECK.toBoolean() }
            }
            parallel{
                stage('Clang Format and Cppcheck') {
@@ -1489,7 +1496,7 @@ pipeline {
                                            -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
                                            -D GEMM_MULTI_D_DATATYPE="fp16" \
                                            -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
-                                            -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8" \
+                                            -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
                                            -D GEMM_PRESHUFFLE_LAYOUT="rcr" \
                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && \
                                           ninja -j64 benchmark_gemm_all && \
@@ -1529,7 +1536,7 @@ pipeline {
                                            -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
                                            -D GEMM_MULTI_D_DATATYPE="fp16" \
                                            -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
-                                            -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8" \
+                                            -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
                                            -D GEMM_PRESHUFFLE_LAYOUT="rcr" \
                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && \
                                           ninja -j64 benchmark_gemm_all && \
@@ -1571,11 +1578,7 @@ pipeline {
                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && \
                                           ninja -j64 benchmark_gemm_all && \
                                           python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \
-                                           --warmup 5 --repeat 5 --verbose --json results.json && \
-                                           ninja -j64 benchmark_gemm_fp16_rcr && \
-                                           ninja -j64 benchmark_gemm_fp16_rrr && \
-                                           ninja -j64 benchmark_gemm_fp16_crr && \
-                                           ninja -j64 benchmark_gemm_fp16_ccr """
+                                           --warmup 5 --repeat 5 --verbose --json results.json """
                    }
                    steps{
                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
@@ -1854,4 +1857,36 @@ pipeline {
            }
        }
    }
+    post {
+        failure {
+            node(rocmnode("nogpu")) {
+                script {
+                    // Get the build log.
+                    def buildLog = sh(script: 'wget -q --no-check-certificate -O - ' + BUILD_URL + 'consoleText', returnStdout: true)
+                    // Check for patterns in the log.
+                    def foundPatterns = []
+                    for (patternMap in failurePatterns) {
+                        def result = checkForPattern(patternMap.pattern, buildLog)
+                        if (result.found) {
+                            foundPatterns.add([
+                                description: patternMap.description,
+                                matchedLine: result.matchedLine,
+                                context: result.context
+                            ])
+                        }
+                    }
+                    // Send a notification for each matched failure pattern.
+                    for (patternMap in foundPatterns) {
+                        withCredentials([string(credentialsId: 'ck_ci_errors_webhook_url', variable: 'WEBHOOK_URL')]) {
+                        sh '''
+                            curl -X POST "${WEBHOOK_URL}" \
+                            -H 'Content-Type: application/json' \
+                            -d '{"text": "\\n\\n**Build Failed**\\n\\n**Issues detected:** ''' + patternMap.description + '''\\n\\n**Log context:**\\n```\\n''' + patternMap.context.replace("'", "\\'") + '''\\n```\\n\\n**Job:** ''' + env.JOB_NAME + '''\\n\\n**Build:** #''' + env.BUILD_NUMBER + '''\\n\\n**URL:** ''' + env.RUN_DISPLAY_URL + '''"}'
+                        '''
+                        }
+                    }                    
+                }
+            }
+        }
+    }
 }
--- a/client_example/01_gemm/README.md
+++ b/client_example/01_gemm/README.md
@@ -1,5 +1,22 @@
-[Back to supported operations](../../../include/ck/README.md)
-# Composable Kernel GEMM
+# Client Example: Basic GEMM
+
+## Theory
+
+This client example demonstrates a basic **GEMM (General Matrix Multiplication)** operation using the Composable Kernel library. GEMM is a core operation in linear algebra and deep learning, computing the product of two matrices and optionally adding a bias or scaling.
+
+**Mathematical Formulation:**
+$$
+C = \alpha (A \times B) + \beta D
+$$
+- $A$: [M, K] input matrix
+- $B$: [K, N] weight matrix
+- $D$: [M, N] optional bias or residual
+- $C$: [M, N] output
+- $\alpha, \beta$: scalars (often 1.0, 0.0)
+
+**Algorithmic Background:**
+- The operation is implemented using a tiled/blocking strategy for memory efficiency.
+- GEMM is the computational backbone for transformer attention, MLPs, and CNNs (via im2col).

 ## GEMM
 General matrix multiplications operation. In CK GEMM operation is called as `DeviceGemm` and requires following types as template parameters:
@@ -124,3 +141,38 @@ Table of supported cases by instance factory with XDL instruction for Row/Row/Ro
 * **DeviceGemmReduce** - GEMM fused with reduction.
 * **DeviceGemm_Streamk_V2** - GEMM stream K implementation. Implementation allows to use reduction instead of AtomicAdd.
 * **DeviceGemmStreamK** - GEMM stream K implementation using AtomicAdd.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/01_gemm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./gemm
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/01_gemm/
+├── gemm.cpp         # Main client example: sets up, runs, and verifies GEMM
+└── CMakeLists.txt   # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in `gemm.cpp`):  
+  Sets up input matrices, configures GEMM parameters, launches the GEMM kernel, and verifies the result.
+- **GEMM kernel invocation**:  
+  Uses the Composable Kernel device API to launch the GEMM operation.
+
+This client example provides a minimal, end-to-end demonstration of using Composable Kernel for matrix multiplication in a user application.
--- a/client_example/02_gemm_add_add_fastgelu/README.md
+++ b/client_example/02_gemm_add_add_fastgelu/README.md
@@ -0,0 +1,65 @@
+# Client Example: GEMM with Add, Add, and FastGELU Fusion
+
+## Theory
+
+This client example demonstrates **GEMM fused with two addition operations and FastGELU activation**. This pattern is common in transformer feed-forward networks and other neural architectures where a linear transformation is followed by bias addition, residual addition, and a non-linear activation.
+
+**Mathematical Formulation:**
+$$
+E = \text{FastGELU}((A \times B) + D_0 + D_1)
+$$
+- $A$: [M, K] input matrix
+- $B$: [K, N] weight matrix
+- $D_0$: [N] bias vector (broadcasted)
+- $D_1$: [M, N] residual tensor
+- $E$: [M, N] output
+
+FastGELU is an efficient approximation of GELU:
+$$
+\text{FastGELU}(x) = x \cdot \sigma(1.702 \cdot x)
+$$
+where $\sigma$ is the sigmoid function.
+
+**Algorithmic Background:**
+- The GEMM result is kept in registers, bias and residual are added, and FastGELU is applied before writing to global memory.
+- No intermediate results are written to global memory.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/02_gemm_add_add_fastgelu
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./gemm_add_add_fastgelu
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/02_gemm_add_add_fastgelu/
+├── gemm_add_add_fastgelu.cpp         # Main client example: GEMM+Add+Add+FastGELU
+├── gemm_add_add_fastgelu_generic.cpp # Generic variant
+├── gemm_add_fastgelu.cpp             # GEMM+Add+FastGELU
+├── gemm_add_fastgelu_generic.cpp     # Generic variant
+├── gemm_fastgelu.cpp                 # GEMM+FastGELU only
+├── gemm_fastgelu_generic.cpp         # Generic variant
+└── CMakeLists.txt                    # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input matrices, configures GEMM and epilogue parameters, launches the fused kernel, and verifies the result.
+- **Fused kernel invocation**:  
+  Uses the Composable Kernel device API to launch the GEMM with fused addition and FastGELU.
+
+This client example provides several variants to demonstrate different levels of fusion and genericity for transformer-style MLP layers.
--- a/client_example/03_gemm_layernorm/README.md
+++ b/client_example/03_gemm_layernorm/README.md
@@ -0,0 +1,57 @@
+# Client Example: GEMM with LayerNorm Fusion
+
+## Theory
+
+This client example demonstrates **GEMM fused with layer normalization** and additional elementwise operations. This pattern is common in transformer feed-forward networks and other architectures where a linear transformation is followed by normalization and activation.
+
+**Mathematical Formulation:**
+- GEMM: $Y = A \times B$
+- Additions: $Z = Y + D_0 + D_1$ (bias, residual, etc.)
+- Activation: $H = \text{ReLU}(Z)$ (or other activation)
+- LayerNorm: $\text{LayerNorm}(H) = \gamma \cdot \frac{H - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta$
+
+$\mu$, $\sigma^2$ are mean and variance over the normalization axis; $\gamma$, $\beta$ are learnable scale and shift.
+
+**Algorithmic Background:**
+- The GEMM result is kept in registers; the elementwise ops and layer normalization are fused in the epilogue.
+- LayerNorm is typically applied over the last dimension (features).
+- This fusion reduces memory traffic and is common in transformer MLP blocks.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/03_gemm_layernorm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (naive)
+./gemm_add_add_layernorm_naive
+
+# Example run (with ReLU and Welford)
+./gemm_add_relu_add_layernorm_welford
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/03_gemm_layernorm/
+├── gemm_add_add_layernorm_naive.cpp         # GEMM + Add + Add + LayerNorm (naive)
+├── gemm_add_relu_add_layernorm_welford.cpp  # GEMM + Add + ReLU + Add + LayerNorm (Welford)
+├── CMakeLists.txt                           # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input matrices, configures GEMM and epilogue parameters, launches the fused kernel, and verifies the result.
+- **LayerNorm implementation**:  
+  Demonstrates both naive and numerically stable (Welford) algorithms for mean/variance.
+
+This client example provides variants to demonstrate different levels of fusion and normalization for transformer-style MLP layers.
--- a/client_example/04_contraction/README.md
+++ b/client_example/04_contraction/README.md
@@ -0,0 +1,56 @@
+# Client Example: General Tensor Contraction
+
+## Theory
+
+This client example demonstrates **general tensor contraction** operations, including bilinear and scaled contractions. Tensor contraction generalizes matrix multiplication to higher dimensions and is used in scientific computing, quantum chemistry, and advanced neural network layers.
+
+**Mathematical Formulation:**
+- General contraction: $C_{i,j} = \sum_k A_{i,k} \cdot B_{k,j}$, where $i$, $j$, and $k$ may each stand for a group of several tensor dimensions
+- Bilinear contraction: $C = \alpha (A \cdot B) + \beta D$
+- Scale contraction: $C = \alpha (A \cdot B)$ (the contraction result is multiplied by a scalar factor $\alpha$)
+
+**Algorithmic Background:**
+- Contraction can be performed over arbitrary axes and supports broadcasting.
+- Bilinear and scale contractions are used for feature fusion, gating, and scientific workloads.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/04_contraction
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (bilinear FP32)
+./contraction_bilinear_fp32
+
+# Example run (scale FP64)
+./contraction_scale_fp64
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/04_contraction/
+├── contraction_bilinear_fp32.cpp         # Bilinear contraction (FP32)
+├── contraction_bilinear_fp64.cpp         # Bilinear contraction (FP64)
+├── contraction_g1m2n3k1_add_xdl_fp16.cpp # Grouped contraction with addition (FP16)
+├── contraction_scale_fp32.cpp            # Scale contraction (FP32)
+├── contraction_scale_fp64.cpp            # Scale contraction (FP64)
+├── CMakeLists.txt                        # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input tensors, configures contraction parameters, launches the contraction kernel, and verifies the result.
+- **Contraction kernel invocation**:  
+  Uses the Composable Kernel device API to launch the contraction operation.
+
+This client example provides several variants to demonstrate different contraction types and data types for scientific and ML workloads.
--- a/client_example/05_layernorm/README.md
+++ b/client_example/05_layernorm/README.md
@@ -0,0 +1,66 @@
+# Client Example: Layer Normalization (Forward and Backward)
+
+## Theory
+
+This client example demonstrates **layer normalization** in both forward and backward modes, for 2D and 4D tensors. Layer normalization is used in transformers and other neural networks to normalize activations across the feature dimension, improving training stability.
+
+**Mathematical Formulation:**
+Given input $X$:
+- Mean: $\mu = \frac{1}{N} \sum_{i=1}^N X_i$
+- Variance: $\sigma^2 = \frac{1}{N} \sum_{i=1}^N (X_i - \mu)^2$
+- Normalized: $\hat{X}_i = \frac{X_i - \mu}{\sqrt{\sigma^2 + \epsilon}}$
+- Output: $Y_i = \gamma \hat{X}_i + \beta$
+
+$\gamma$, $\beta$ are learnable scale and shift parameters.
+
+**Algorithmic Background:**
+- Forward pass computes mean, variance, normalization, and affine transformation.
+- Backward pass computes gradients with respect to input, gamma, and beta.
+- Supports both 2D (batch, feature) and 4D (batch, channel, height, width) tensors.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/05_layernorm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (2D forward)
+./layernorm2d_fwd
+
+# Example run (4D forward)
+./layernorm4d_fwd
+
+# Example run (2D backward, data)
+./layernorm2d_bwd_data
+
+# Example run (2D backward, gamma/beta)
+./layernorm2d_bwd_gamma_beta
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/05_layernorm/
+├── layernorm2d_fwd.cpp         # 2D layernorm forward
+├── layernorm4d_fwd.cpp         # 4D layernorm forward
+├── layernorm2d_bwd_data.cpp    # 2D layernorm backward (data)
+├── layernorm2d_bwd_gamma_beta.cpp # 2D layernorm backward (gamma/beta)
+├── CMakeLists.txt              # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input tensors, configures normalization parameters, launches the forward or backward kernel, and verifies the result.
+- **LayerNorm implementation**:  
+  Demonstrates both forward and backward passes for different tensor shapes.
+
+This client example provides a comprehensive demonstration of layer normalization for both inference and training in deep learning models.
--- a/client_example/06_softmax/README.md
+++ b/client_example/06_softmax/README.md
@@ -0,0 +1,54 @@
+# Client Example: 4D Softmax
+
+## Theory
+
+This client example demonstrates **Softmax computation over 4D tensors**. Softmax is a key operation in deep learning, especially in attention mechanisms and classification, converting logits into normalized probabilities.
+
+**Mathematical Formulation:**
+Given input $X$, softmax is applied along a chosen axis $a$ (the indices $i$ and $j$ below run over that axis):
+$$
+\text{softmax}(X)_i = \frac{\exp(X_i)}{\sum_j \exp(X_j)}
+$$
+
+**Algorithmic Background:**
+- Softmax is implemented using a numerically stable algorithm:
+  1. Subtract the maximum value for numerical stability.
+  2. Exponentiate and sum.
+  3. Normalize by the sum.
+- Efficient parallel Softmax requires careful reduction and memory access patterns.
+- This example demonstrates Softmax over a 4D tensor, as used in attention and vision models.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/06_softmax
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./softmax4d
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/06_softmax/
+├── softmax4d.cpp         # Main client example: sets up, runs, and verifies 4D softmax
+├── CMakeLists.txt        # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in `softmax4d.cpp`):  
+  Sets up input tensors, configures Softmax parameters, launches the Softmax kernel, and verifies the result.
+- **Softmax kernel invocation**:  
+  Uses the Composable Kernel device API to launch the Softmax operation.
+
+This client example provides a demonstration of efficient, numerically stable Softmax for 4D tensors in deep learning models.
--- a/client_example/07_grouped_convnd_fwd/README.md
+++ b/client_example/07_grouped_convnd_fwd/README.md
@@ -1,5 +1,18 @@
-[Back to supported operations](../../../include/ck/README.md)
-# Composable Kernel Grouped Convolution
+# Client Example: Grouped N-Dimensional Convolution Forward
+
+## Theory
+
+This client example demonstrates **grouped N-dimensional convolution forward** for 1D, 2D, and 3D inputs, supporting multiple data types (including BF8 and FP8). Grouped convolution is used in modern CNNs and vision transformers to reduce computation and enable channel-wise or expert-wise processing.
+
+**Mathematical Formulation:**
+Given input $X$ and weights $W$ for $G$ groups:
+- For each group $g$:
+  $$
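+  Y^g[n, c_{out}, o_1, \dots, o_d] = \sum_{c_{in}} \sum_{k_1} \dots \sum_{k_d} X^g[n, c_{in}, o_1 + k_1, \dots, o_d + k_d] \cdot W^g[c_{out}, c_{in}, k_1, \dots, k_d]
+  $$
+- Each group operates on a subset of input/output channels; $d$ is the number of spatial dimensions (stride, dilation, and padding are omitted from the formula for brevity).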
+
+**Algorithmic Background:**

 ## Grouped Convolution Forward
 Grouped convolution operation for 1D, 2D or 3D spatial dimensions. Convolution utilizes GEMM kernel after tensor coordinate transform. In CK Grouped Convolution Forward operation is called as `DeviceGroupedConvFwdMultipleABD` and requires following types as template parameters:
@@ -66,3 +79,52 @@ Table of supported cases by instance factory with fused elementwise operation:
 * **Scale** - 3D, NHWGC, bf16/fp16/fp32/int8
 * **Scale + Add (for A and B)** - 3D, NHWGC, bf16/fp16/fp32/int8
 * **Scale + Add + Scale + Add + Relu** - 3D, NHWGC, bf16/fp16/fp32/int8
+
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/07_grouped_convnd_fwd
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (2D grouped convolution)
+./grouped_conv2d_fwd
+
+# Example run (3D grouped convolution, BF8)
+./grouped_conv3d_fwd_bf8
+
+# Example run (3D grouped convolution, FP8)
+./grouped_conv3d_fwd_fp8
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/07_grouped_convnd_fwd/
+├── grouped_conv1d_fwd.cpp         # 1D grouped convolution
+├── grouped_conv2d_fwd.cpp         # 2D grouped convolution (NCHW)
+├── grouped_conv2d_fwd_ngchw.cpp   # 2D grouped convolution (NGCHW)
+├── grouped_conv3d_fwd_bf8.cpp     # 3D grouped convolution (BF8)
+├── grouped_conv3d_fwd_fp8.cpp     # 3D grouped convolution (FP8)
+├── grouped_conv3d_fwd_bf8_fp8.cpp # 3D grouped convolution (BF8/FP8 mixed)
+├── grouped_conv3d_fwd_fp8_bf8.cpp # 3D grouped convolution (FP8/BF8 mixed)
+├── common.hpp                     # Common utilities for grouped convolution
+├── CMakeLists.txt                 # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input tensors, configures grouped convolution parameters, launches the kernel, and verifies the result.
+- **Grouped convolution kernel invocation**:  
+  Uses the Composable Kernel device API to launch grouped convolution for different dimensions and data types.
+
+This client example provides a comprehensive demonstration of grouped convolution for efficient CNN and vision transformer models.
--- a/client_example/08_fused_attention/README.md
+++ b/client_example/08_fused_attention/README.md
@@ -0,0 +1,89 @@
+# Fused Attention Examples
+
+This directory contains comprehensive examples demonstrating CK's high-performance fused attention implementations, which are critical for modern transformer architectures and large language models.
+
+---
+
+## Theory
+
+**Fused Multi-Head Attention Operation:**
+The fused attention mechanism performs the core transformer operation in a single, optimized kernel:
+
+$$
+\text{Attention}(Q, K, V) = \text{Softmax}(Q K^T / \sqrt{d_k}) V
+$$
+
+**Detailed Mathematical Steps:**
+1. **Query-Key Attention Scores**: $S = Q K^T$
+2. **Scale**: $S_{\text{scaled}} = S / \sqrt{d_k}$
+3. **Softmax**: $A = \text{Softmax}(S_{\text{scaled}})$
+4. **Weighted Value Sum**: $\text{Output} = A V$
+
+- Multi-head extension: Each head computes attention independently, then results are concatenated and projected.
+- Tensor shapes: Q, K, V, Output are typically [Batch, Seq_len, Num_heads, Head_dim].
+
+**Algorithmic Background:**
+- Fused attention combines two GEMMs and a softmax in a single kernel, minimizing memory traffic.
+- Supports bias, masking, and permutation for transformer and LLM workloads.
+
+---
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/08_fused_attention
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (basic fused attention)
+./fused_attention
+
+# Example run (fused attention with bias)
+./fused_attention_bias
+```
+
+---
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/08_fused_attention/
+├── fused_attention.cpp         # Main client example: fused attention (Q, K, V)
+├── fused_attention_bias.cpp    # Fused attention with bias
+├── CMakeLists.txt              # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up Q, K, V tensors, configures attention parameters, launches the fused kernel, and verifies the result.
+- **Fused attention kernel invocation**:  
+  Uses the Composable Kernel device API to launch the fused attention operation, optionally with bias.
+
+---
+
+## Additional Details
+
+- Supports FP16, BF16, FP32, and mixed precision.
+- Handles causal and generic masking for autoregressive and variable-length models.
+- Optimized for memory efficiency (no intermediate attention matrix in global memory).
+- Example parameters can be adjusted in the source for different transformer workloads.
+
+---
+
+## Related Examples
+
+- [01_gemm](../01_gemm/README.md): GEMM for Q×K^T and Attn×V
+- [06_softmax](../06_softmax/README.md): Softmax client API usage
+- [03_gemm_layernorm](../03_gemm_layernorm/README.md): Fused GEMM + layer normalization
+- [07_grouped_convnd_fwd](../07_grouped_convnd_fwd/README.md): Grouped convolution for vision transformers
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/09_quantization/README.md
+++ b/client_example/09_quantization/README.md
@@ -0,0 +1,85 @@
+# Client Example: Quantization for GEMM and Conv2D
+
+## Theory
+
+This client example demonstrates **quantized GEMM and 2D convolution** operations, including per-layer and per-channel quantization, and fusion with bias and activation functions. Quantization reduces memory and computation by representing values with lower-precision integer types (e.g., int8), enabling efficient inference in deep learning.
+
+**Mathematical Formulation:**
+- Quantized GEMM: $C = \text{dequant}(A_q) \times \text{dequant}(B_q)$
+- Quantized Conv2D: $Y = \text{dequant}(X_q) * \text{dequant}(W_q)$
+- $\text{dequant}(x_q) = (x_q - z) \cdot s$ (scale $s$, zero-point $z$)
+- Per-layer: one scale/zero-point per tensor
+- Per-channel: scale/zero-point per output channel
+
+**Algorithmic Background:**
+- Quantized values are dequantized on-the-fly during computation.
+- Accumulation is performed in higher precision for accuracy.
+- Supports bias addition and activation fusion (ReLU, Tanh).
+- Per-channel quantization improves accuracy for convolutional layers.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/09_quantization
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (GEMM quantization)
+./gemm_quantization
+
+# Example run (Conv2D per-layer quantization)
+./conv2d_fwd_perlayer_quantization
+
+# Example run (Conv2D per-channel quantization)
+./conv2d_fwd_perchannel_quantization
+
+# Example run (Conv2D + bias + ReLU + per-channel quantization)
+./conv2d_fwd_bias_relu_perchannel_quantization
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/09_quantization/
+├── gemm_quantization.cpp                         # Quantized GEMM
+├── conv2d_fwd_perlayer_quantization.cpp          # Conv2D per-layer quantization
+├── conv2d_fwd_perchannel_quantization.cpp        # Conv2D per-channel quantization
+├── conv2d_fwd_bias_relu_perlayer_quantization.cpp # Conv2D + bias + ReLU + per-layer quantization
+├── conv2d_fwd_bias_relu_perchannel_quantization.cpp # Conv2D + bias + ReLU + per-channel quantization
+├── conv2d_fwd_bias_tanh_perlayer_quantization.cpp # Conv2D + bias + Tanh + per-layer quantization
+├── conv2d_fwd_bias_tanh_perchannel_quantization.cpp # Conv2D + bias + Tanh + per-channel quantization
+├── CMakeLists.txt                                # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input tensors, configures quantization parameters, launches the quantized kernel, and verifies the result.
+- **Quantization kernel invocation**:  
+  Uses the Composable Kernel device API to launch quantized GEMM or Conv2D with optional bias and activation.
+
+---
+
+## Additional Details
+
+- Supports int8 quantization, per-layer and per-channel scaling.
+- Demonstrates fusion with bias and activation (ReLU, Tanh).
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [01_gemm](../01_gemm/README.md): GEMM for quantized matrix multiplication
+- [14_gemm_quantization](../../example/14_gemm_quantization/README.md): GEMM quantization in the main example directory
+- [40_conv2d_fwd_quantization](../../example/40_conv2d_fwd_quantization/README.md): Conv2D quantization in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/10_grouped_convnd_bwd_data/README.md
+++ b/client_example/10_grouped_convnd_bwd_data/README.md
@@ -1,4 +1,4 @@
-[Back to supported operations](../../../include/ck/README.md)
+[Back to supported operations](../../include/ck/README.md)
 # Composable Kernel Grouped Convolution

 ## Grouped Convolution Backward Data
@@ -46,3 +46,56 @@ Table of supported cases by instance factory with fused elementwise operation:

 * **Bilinear** - 3D, NHWGC, bf16/fp16/fp32
 * **Scale** - 3D, NHWGC, bf16/fp16/fp32
+
+---
+
+## Theory
+
+**Grouped convolution backward data** computes the gradient of the loss with respect to the input tensor, given the output gradient and the weights, for each group independently. This is essential for training CNNs and grouped/expert models.
+
+**Mathematical Formulation:**
+For each group $g$:
+$$
+\text{InputGrad}^g = \text{ConvBwdData}(\text{OutputGrad}^g, \text{Weights}^g)
+$$
+
+- Supports 1D, 2D, and 3D grouped convolutions.
+- Utilizes implicit GEMM for efficient computation.
+- Supports fused elementwise operations (e.g., bilinear, scale).
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/10_grouped_convnd_bwd_data
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (2D grouped convolution backward data)
+./grouped_conv2d_bwd_data
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/10_grouped_convnd_bwd_data/
+├── grouped_conv1d_bwd_data.cpp         # 1D grouped convolution backward data
+├── grouped_conv2d_bwd_data.cpp         # 2D grouped convolution backward data
+├── grouped_conv3d_bwd_data.cpp         # 3D grouped convolution backward data
+├── CMakeLists.txt                      # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input/output tensors, configures grouped convolution parameters, launches the backward data kernel, and verifies the result.
+- **Grouped convolution backward kernel invocation**:  
+  Uses the Composable Kernel device API to launch grouped convolution backward data for different dimensions and data types.
+
+This client example provides a comprehensive demonstration of grouped convolution backward data for efficient CNN and vision transformer training.
--- a/client_example/11_grouped_conv_bwd_weight/README.md
+++ b/client_example/11_grouped_conv_bwd_weight/README.md
@@ -1,4 +1,4 @@
-[Back to supported operations](../../../include/ck/README.md)
+[Back to supported operations](../../include/ck/README.md)
 # Composable Kernel Grouped Convolution

 ## Grouped Convolution Backward Weight
@@ -60,3 +60,63 @@ Table of supported cases by instance factory with fused elementwise operation:

 * **Bilinear** - 3D, NHWGC, bf16(fp32 for weight)/fp16/fp32
 * **Scale** - 3D, NHWGC, bf16(fp32 for weight)/fp16/fp32
+
+---
+
+## Theory
+
+**Grouped convolution backward weight** computes the gradient of the loss with respect to the weights, given the input and the output gradient, for each group independently. This is essential for training CNNs and grouped/expert models.
+
+**Mathematical Formulation:**
+For each group $g$:
+$$
+\text{WeightGrad}^g = \text{ConvBwdWeight}(\text{Input}^g, \text{OutputGrad}^g)
+$$
+
+- Supports 1D, 2D, and 3D grouped convolutions.
+- Utilizes implicit GEMM for efficient computation.
+- Supports fused elementwise operations (e.g., bilinear, scale).
+- Uses splitK for large GEMM K dimensions.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/11_grouped_conv_bwd_weight
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (2D grouped convolution backward weight, FP16)
+./grouped_conv2d_bwd_weight_fp16
+
+# Example run (3D grouped convolution backward weight, FP32)
+./grouped_conv3d_bwd_weight_fp32
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/11_grouped_conv_bwd_weight/
+├── grouped_conv1d_bwd_weight_fp16.cpp         # 1D grouped convolution backward weight (FP16)
+├── grouped_conv2d_bwd_weight_fp16.cpp         # 2D grouped convolution backward weight (FP16)
+├── grouped_conv3d_bwd_weight_fp16.cpp         # 3D grouped convolution backward weight (FP16)
+├── grouped_conv3d_bwd_weight_fp32.cpp         # 3D grouped convolution backward weight (FP32)
+├── grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8.cpp # 3D grouped convolution backward weight (FP16, BF8/FP8 mixed)
+├── common.hpp                                 # Common utilities for grouped convolution
+├── CMakeLists.txt                             # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input/output tensors, configures grouped convolution parameters, launches the backward weight kernel, and verifies the result.
+- **Grouped convolution backward weight kernel invocation**:  
+  Uses the Composable Kernel device API to launch grouped convolution backward weight for different dimensions and data types.
+
+This client example provides a comprehensive demonstration of grouped convolution backward weight for efficient CNN and vision transformer training.
--- a/client_example/12_elementwise_normalization/README.md
+++ b/client_example/12_elementwise_normalization/README.md
@@ -0,0 +1,69 @@
+# Client Example: Elementwise Layer Normalization
+
+## Theory
+
+This client example demonstrates **elementwise layer normalization** for 2D tensors. Layer normalization is used in transformers and other neural networks to normalize activations across the feature dimension, improving training stability. Elementwise normalization fuses normalization with other elementwise operations for efficiency.
+
+**Mathematical Formulation:**
+Given input $X$:
+- Mean: $\mu = \frac{1}{N} \sum_{i=1}^N X_i$
+- Variance: $\sigma^2 = \frac{1}{N} \sum_{i=1}^N (X_i - \mu)^2$
+- Normalized: $\hat{X}_i = \frac{X_i - \mu}{\sqrt{\sigma^2 + \epsilon}}$
+- Output: $Y_i = \gamma \hat{X}_i + \beta$
+
+$\gamma$, $\beta$ are learnable scale and shift parameters.
+
+**Algorithmic Background:**
+- Computes mean and variance per row (sample).
+- Applies normalization and affine transformation.
+- Can be fused with other elementwise operations for efficiency.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/12_elementwise_normalization
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./elementwise_layernorm2d
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/12_elementwise_normalization/
+├── elementwise_layernorm2d.cpp         # Main client example: elementwise layernorm for 2D tensors
+├── CMakeLists.txt                      # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in `elementwise_layernorm2d.cpp`):  
+  Sets up input tensors, configures normalization parameters, launches the normalization kernel, and verifies the result.
+- **Elementwise normalization kernel invocation**:  
+  Uses the Composable Kernel device API to launch layer normalization, optionally fused with other elementwise ops.
+
+---
+
+## Additional Details
+
+- Supports fusion with other elementwise operations for efficiency.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [05_layernorm](../05_layernorm/README.md): Layer normalization client API
+- [27_layernorm2d_fwd](../../example/27_layernorm2d_fwd/README.md): Layer normalization in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/13_batchnorm/README.md
+++ b/client_example/13_batchnorm/README.md
@@ -0,0 +1,76 @@
+# Client Example: Batch Normalization (Forward, Backward, Inference)
+
+## Theory
+
+This client example demonstrates **batch normalization** in forward, backward, and inference modes for NHWC tensors. Batch normalization is used in deep neural networks to normalize activations across the batch and spatial dimensions, improving training stability and convergence.
+
+**Mathematical Formulation:**
+Given input $X[N, H, W, C]$:
+- Mean: $\mu_c = \frac{1}{NHW} \sum_{n,h,w} X_{n,h,w,c}$
+- Variance: $\sigma^2_c = \frac{1}{NHW} \sum_{n,h,w} (X_{n,h,w,c} - \mu_c)^2$
+- Normalized: $\hat{X}_{n,h,w,c} = \frac{X_{n,h,w,c} - \mu_c}{\sqrt{\sigma^2_c + \epsilon}}$
+- Output: $Y_{n,h,w,c} = \gamma_c \hat{X}_{n,h,w,c} + \beta_c$
+
+$\gamma_c$, $\beta_c$ are learnable scale and shift parameters per channel.
+
+**Algorithmic Background:**
+- Forward pass computes mean, variance, normalization, and affine transformation.
+- Backward pass computes gradients with respect to input, gamma, and beta.
+- Inference uses running mean and variance for normalization.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/13_batchnorm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (forward)
+./batchnorm_fwd_nhwc
+
+# Example run (backward)
+./batchnorm_bwd_nhwc
+
+# Example run (inference)
+./batchnorm_infer_nhwc
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/13_batchnorm/
+├── batchnorm_fwd_nhwc.cpp         # Batchnorm forward (NHWC)
+├── batchnorm_bwd_nhwc.cpp         # Batchnorm backward (NHWC)
+├── batchnorm_infer_nhwc.cpp       # Batchnorm inference (NHWC)
+├── CMakeLists.txt                 # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input tensors, configures batchnorm parameters, launches the forward, backward, or inference kernel, and verifies the result.
+- **BatchNorm kernel invocation**:  
+  Uses the Composable Kernel device API to launch batch normalization for different modes.
+
+---
+
+## Additional Details
+
+- Supports NHWC layout for image and vision models.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [34_batchnorm](../../example/34_batchnorm/README.md): Batch normalization in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/14_instance_id/README.md
+++ b/client_example/14_instance_id/README.md
@@ -0,0 +1,63 @@
+# Client Example: BatchNorm with Instance ID Selection
+
+## Theory
+
+This client example demonstrates **batch normalization** using explicit instance ID selection. In Composable Kernel, "instance ID" refers to a specific kernel configuration (tile sizes, vectorization, etc.) chosen for a given workload. This allows users to benchmark or select the best-performing kernel for their data shape.
+
+**Mathematical Formulation:**
+See [BatchNorm Theory](../13_batchnorm/README.md) for the mathematical details of batch normalization.
+
+**Algorithmic Background:**
+- The example shows how to enumerate and select a specific kernel instance by its ID.
+- Useful for performance tuning, benchmarking, and debugging.
+- BatchNorm is performed in NHWC layout.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/14_instance_id
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (selects a specific kernel instance)
+./batchnorm_fwd_instance_id
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/14_instance_id/
+├── batchnorm_fwd_instance_id.cpp         # Batchnorm forward with instance ID selection
+├── CMakeLists.txt                        # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in `batchnorm_fwd_instance_id.cpp`):  
+  Sets up input tensors, enumerates available kernel instances, selects an instance by ID, launches the batchnorm kernel, and verifies the result.
+- **Instance selection**:  
+  Demonstrates how to use the Composable Kernel API to list and select kernel configurations.
+
+---
+
+## Additional Details
+
+- Useful for kernel benchmarking and performance tuning.
+- Example parameters and instance ID can be adjusted in the source.
+
+---
+
+## Related Examples
+
+- [13_batchnorm](../13_batchnorm/README.md): Batch normalization client API
+- [34_batchnorm](../../example/34_batchnorm/README.md): Batch normalization in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/15_convnd_bwd_data/README.md
+++ b/client_example/15_convnd_bwd_data/README.md
@@ -0,0 +1,73 @@
+# Client Example: N-Dimensional Convolution Backward Data
+
+## Theory
+
+This client example demonstrates **N-dimensional convolution backward data** for 3D inputs, supporting multiple data types (FP16, FP32). The backward data operation computes the gradient of the loss with respect to the input tensor, given the output gradient and the weights. This is essential for training CNNs and 3D vision models.
+
+**Mathematical Formulation:**
+For input $X$, weights $W$, and output gradient $dY$:
+$$
+dX = \text{ConvBwdData}(dY, W)
+$$
+
+- Supports 3D convolution (the same approach extends to other numbers of spatial dimensions).
+- Utilizes implicit GEMM for efficient computation.
+
+**Algorithmic Background:**
+- The backward data operation is implemented as a convolution with transformed coordinates.
+- Used in training pipelines for 3D CNNs, medical imaging, and volumetric data.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/15_convnd_bwd_data
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (3D backward data, FP16)
+./conv3d_bwd_data_fp16
+
+# Example run (3D backward data, FP32)
+./conv3d_bwd_data_fp32
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/15_convnd_bwd_data/
+├── conv3d_bwd_data_fp16.cpp         # 3D convolution backward data (FP16)
+├── conv3d_bwd_data_fp32.cpp         # 3D convolution backward data (FP32)
+├── common.hpp                       # Common utilities for convolution
+├── CMakeLists.txt                   # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input/output tensors, configures convolution parameters, launches the backward data kernel, and verifies the result.
+- **Backward data kernel invocation**:  
+  Uses the Composable Kernel device API to launch convolution backward data for different data types.
+
+---
+
+## Additional Details
+
+- Supports FP16 and FP32 for 3D convolution.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [10_grouped_convnd_bwd_data](../10_grouped_convnd_bwd_data/README.md): Grouped convolution backward data
+- [17_convnd_bwd_data](../../example/17_convnd_bwd_data/README.md): Convolution backward data in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/16_convnd_fwd/README.md
+++ b/client_example/16_convnd_fwd/README.md
@@ -0,0 +1,85 @@
+# Client Example: N-Dimensional Convolution Forward
+
+## Theory
+
+This client example demonstrates **N-dimensional convolution forward** for 3D inputs, supporting multiple data types (FP16, FP32, and FP16 with FP8 compute). Convolution is a fundamental operation in deep learning, especially in convolutional neural networks (CNNs) for images, audio, and volumetric data.
+
+**Mathematical Formulation:**
+Given input $X$, weights $W$:
+$$
+Y = \text{Conv}(X, W)
+$$
+
+- Supports 3D convolution (the same approach extends to other numbers of spatial dimensions).
+- Utilizes implicit GEMM for efficient computation.
+
+**Algorithmic Background:**
+- The forward convolution is lowered to an implicit GEMM using coordinate transformations, so no explicit im2col buffer is materialized.
+- Used in inference and training pipelines for 3D CNNs, medical imaging, and volumetric data.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/16_convnd_fwd
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (3D forward, FP16)
+./conv3d_fwd_fp16
+
+# Example run (3D forward, FP32)
+./conv3d_fwd_fp32
+
+# Example run (3D forward, FP16 data with FP8 compute)
+./conv3d_fwd_fp16_comp_fp8
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/16_convnd_fwd/
+├── conv3d_fwd_fp16.cpp         # 3D convolution forward (FP16)
+├── conv3d_fwd_fp32.cpp         # 3D convolution forward (FP32)
+├── conv3d_fwd_fp16_comp_fp8.cpp # 3D convolution forward (FP16 data, FP8 compute)
+├── common.hpp                  # Common utilities for convolution
+├── CMakeLists.txt              # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input/output tensors, configures convolution parameters, launches the forward kernel, and verifies the result.
+- **Forward convolution kernel invocation**:  
+  Uses the Composable Kernel device API to launch convolution forward for different data types.
+
+---
+
+## Additional Details
+
+- Supports FP16, FP32, and FP16 with FP8 compute for 3D convolution.
+- Parameters can be adjusted in the source files for different workloads. The following parameters are configurable:
+  - `NumDimSpatial`: Number of spatial dimensions (default: 3 for 3D convolution)
+  - `G`: Number of groups (default: 1)
+  - `N`: Batch size (default: 64)
+  - `K`: Number of output channels (default: 128)
+  - `C`: Number of input channels (default: 64)
+  - `Z`, `Y`, `X`: Filter/kernel dimensions (default: 3x3x3)
+  - `Di`, `Hi`, `Wi`: Input dimensions - depth, height, width (default: 28x28x3)
+  - `Do`, `Ho`, `Wo`: Output dimensions - depth, height, width (default: 28x28x3)
+
+---
+
+## Related Examples
+
+- [09_convnd_fwd](../../example/09_convnd_fwd/README.md): N-dimensional convolution in the main example directory
+- [30_grouped_conv_fwd_multiple_d](../../example/30_grouped_conv_fwd_multiple_d/README.md): Grouped convolution forward with multiple D
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/17_grouped_gemm_fastgelu/README.md
+++ b/client_example/17_grouped_gemm_fastgelu/README.md
@@ -0,0 +1,71 @@
+# Client Example: Grouped GEMM with FastGELU Activation
+
+## Theory
+
+This client example demonstrates **grouped GEMM fused with FastGELU activation**. Grouped GEMM performs multiple independent GEMM operations (with potentially different shapes) in a single kernel launch, and FastGELU is a fast approximation of the GELU activation used in transformers and MLPs.
+
+**Mathematical Formulation:**
+For $G$ groups, each with its own $A_g$, $B_g$:
+- GEMM: $Y_g = A_g \times B_g$
+- FastGELU: $E_g = \text{FastGELU}(Y_g)$
+
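+FastGELU is the tanh-based approximation of GELU:
+$$
+\text{FastGELU}(x) = 0.5 \cdot x \cdot \left(1 + \tanh\left(\sqrt{2/\pi} \cdot (x + 0.044715 \cdot x^3)\right)\right)
+$$
+which is equivalent to $x \cdot \sigma\left(2\sqrt{2/\pi} \cdot (x + 0.044715 \cdot x^3)\right)$, where $\sigma$ is the sigmoid function.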
+
+**Algorithmic Background:**
+- Each group can have different matrix sizes and strides.
+- The kernel launches a grid covering all groups, with each block assigned to a group.
+- FastGELU is applied in the epilogue for each group.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/17_grouped_gemm_fastgelu
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./grouped_gemm_fastgelu
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/17_grouped_gemm_fastgelu/
+├── grouped_gemm_fastgelu.cpp         # Main client example: grouped GEMM + FastGELU
+├── CMakeLists.txt                    # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in `grouped_gemm_fastgelu.cpp`):  
+  Sets up input matrices for each group, configures GEMM and epilogue parameters, launches the grouped kernel, and verifies the result.
+- **Grouped GEMM kernel invocation**:  
+  Uses the Composable Kernel device API to launch grouped GEMM with FastGELU activation.
+
+---
+
+## Additional Details
+
+- Supports multiple groups with different matrix shapes.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [15_grouped_gemm](../../example/15_grouped_gemm/README.md): Grouped GEMM in the main example directory
+- [04_gemm_add_add_fastgelu](../../example/04_gemm_add_add_fastgelu/README.md): GEMM with FastGELU fusion
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/18_groupnorm/README.md
+++ b/client_example/18_groupnorm/README.md
@@ -0,0 +1,80 @@
+# Client Example: Group Normalization (Forward and Backward)
+
+## Theory
+
+This client example demonstrates **group normalization** in both forward and backward modes, including fusion with Swish activation. Group normalization normalizes activations across groups of channels, improving training stability for small batch sizes or non-i.i.d. data.
+
+**Mathematical Formulation:**
+Given input $X[N, C, ...]$ divided into $G$ groups:
+- For each group $g$:
+  - Mean: $\mu_g = \frac{1}{|g|} \sum_{i \in g} X_i$
+  - Variance: $\sigma^2_g = \frac{1}{|g|} \sum_{i \in g} (X_i - \mu_g)^2$
+  - Normalized: $\hat{X}_i = \frac{X_i - \mu_g}{\sqrt{\sigma^2_g + \epsilon}}$
+  - Output: $Y_i = \gamma \hat{X}_i + \beta$
+
+$\gamma$, $\beta$ are learnable scale and shift parameters.
+
+- Swish activation: $\text{Swish}(x) = x \cdot \sigma(x)$, where $\sigma$ is the sigmoid function.
+
+**Algorithmic Background:**
+- Forward pass computes mean, variance, normalization, and affine transformation per group.
+- Backward pass computes gradients with respect to input, gamma, and beta.
+- Swish activation can be fused with normalization for efficiency.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/18_groupnorm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (forward with Swish)
+./groupnorm_swish_fwd
+
+# Example run (backward, data)
+./groupnorm_bwd_data
+
+# Example run (backward, gamma/beta)
+./groupnorm_bwd_gamma_beta
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/18_groupnorm/
+├── groupnorm_swish_fwd.cpp         # Groupnorm forward with Swish activation
+├── groupnorm_bwd_data.cpp          # Groupnorm backward (data)
+├── groupnorm_bwd_gamma_beta.cpp    # Groupnorm backward (gamma/beta)
+├── CMakeLists.txt                  # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input tensors, configures groupnorm parameters, launches the forward or backward kernel, and verifies the result.
+- **GroupNorm kernel invocation**:  
+  Uses the Composable Kernel device API to launch group normalization for different modes.
+
+---
+
+## Additional Details
+
+- Supports fusion with Swish activation for efficiency.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [42_groupnorm_fwd](../../example/42_groupnorm_fwd/README.md): Group normalization in the main example directory
+- [54_groupnorm_bwd](../../example/54_groupnorm_bwd/README.md): Group normalization backward in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/19_pool/README.md
+++ b/client_example/19_pool/README.md
@@ -0,0 +1,80 @@
+# Client Example: Pooling Operations (2D Max, 3D Avg)
+
+## Theory
+
+This client example demonstrates **pooling operations** for 2D max pooling and 3D average pooling, including both forward and backward passes. Pooling is used in convolutional neural networks (CNNs) for spatial downsampling, translation invariance, and reducing computation.
+
+**Mathematical Formulation:**
+- **Max Pooling (2D):** $Y_{n,c,h,w} = \max_{i,j} X_{n,c,h \cdot s_H + i, w \cdot s_W + j}$
+- **Average Pooling (3D):** $Y_{n,c,d,h,w} = \frac{1}{k_D k_H k_W} \sum_{i,j,k} X_{n,c,d \cdot s_D + i, h \cdot s_H + j, w \cdot s_W + k}$
+
+Where $s_H, s_W, s_D$ are strides, $k_H, k_W, k_D$ are kernel sizes.
+
+**Algorithmic Background:**
+- Forward pass computes the pooled output.
+- Backward pass computes the gradient with respect to the input.
+- Handles padding and boundary conditions.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/19_pool
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (2D max pool forward)
+./max_pool2d_fwd
+
+# Example run (2D max pool backward)
+./max_pool2d_bwd
+
+# Example run (3D avg pool forward)
+./avg_pool3d_fwd
+
+# Example run (3D avg pool backward)
+./avg_pool3d_bwd
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/19_pool/
+├── max_pool2d_fwd.cpp         # 2D max pooling forward
+├── max_pool2d_bwd.cpp         # 2D max pooling backward
+├── avg_pool3d_fwd.cpp         # 3D average pooling forward
+├── avg_pool3d_bwd.cpp         # 3D average pooling backward
+├── CMakeLists.txt             # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input tensors, configures pooling parameters, launches the forward or backward kernel, and verifies the result.
+- **Pooling kernel invocation**:  
+  Uses the Composable Kernel device API to launch pooling operations for different modes.
+
+---
+
+## Additional Details
+
+- Supports both max and average pooling, forward and backward.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [13_pool2d_fwd](../../example/13_pool2d_fwd/README.md): 2D pooling in the main example directory
+- [48_pool3d_fwd](../../example/48_pool3d_fwd/README.md): 3D pooling in the main example directory
+- [49_maxpool2d_bwd](../../example/49_maxpool2d_bwd/README.md): 2D max pool backward in the main example directory
+- [51_avgpool3d_bwd](../../example/51_avgpool3d_bwd/README.md): 3D avg pool backward in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/20_splitk_gemm/README.md
+++ b/client_example/20_splitk_gemm/README.md
@@ -0,0 +1,66 @@
+# Client Example: Split-K GEMM
+
+## Theory
+
+This client example demonstrates **Split-K GEMM**, a technique for parallelizing matrix multiplication along the K dimension. Split-K is used to improve parallelism and memory bandwidth utilization for large GEMM operations, especially when K is large.
+
+**Mathematical Formulation:**
+- Standard GEMM: $C = A \times B$
+- Split-K: Partition the K dimension into $S$ disjoint chunks $K_1, \dots, K_S$, compute partial results, then reduce:
+  $$
+  C = \sum_{s=1}^{S} A_{[:, K_s]} \times B_{[K_s, :]}
+  $$
+
+**Algorithmic Background:**
+- Each split computes a partial GEMM over a chunk of K.
+- Partial results are reduced (summed) to produce the final output.
+- Useful for large K, limited workspace, or maximizing GPU occupancy.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/20_splitk_gemm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (FP16 compute, FP8 output)
+./splitK_gemm_fp16_f8
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/20_splitk_gemm/
+├── splitK_gemm_fp16_f8.cpp         # Main client example: Split-K GEMM (FP16 compute, FP8 output)
+├── CMakeLists.txt                  # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in `splitK_gemm_fp16_f8.cpp`):  
+  Sets up input matrices, configures Split-K parameters, launches the Split-K GEMM kernel, and verifies the result.
+- **Split-K kernel invocation**:  
+  Uses the Composable Kernel device API to launch the Split-K GEMM operation.
+
+---
+
+## Additional Details
+
+- Supports FP16 compute with FP8 output for memory efficiency.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [35_splitK_gemm](../../example/35_splitK_gemm/README.md): Split-K GEMM in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/21_grouped_gemm_bias/README.md
+++ b/client_example/21_grouped_gemm_bias/README.md
@@ -0,0 +1,65 @@
+# Client Example: Grouped GEMM with Bias
+
+## Theory
+
+This client example demonstrates **grouped GEMM fused with bias addition**. Grouped GEMM performs multiple independent GEMM operations (with potentially different shapes) in a single kernel launch, and bias addition is a standard pattern in neural network layers.
+
+**Mathematical Formulation:**
+For $G$ groups, each with its own $A_g$, $B_g$, $b_g$:
+- GEMM: $Y_g = A_g \times B_g$
+- Bias: $E_g = Y_g + b_g$
+
+**Algorithmic Background:**
+- Each group can have different matrix sizes and strides.
+- The kernel launches a grid covering all groups, with each block assigned to a group.
+- Bias is added in the epilogue for each group.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/21_grouped_gemm_bias
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (grouped GEMM with bias, FP16)
+./grouped_gemm_fixed_nk_bias_fp16
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/21_grouped_gemm_bias/
+├── grouped_gemm_fixed_nk_bias_fp16.cpp         # Main client example: grouped GEMM + bias (FP16)
+├── CMakeLists.txt                              # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in `grouped_gemm_fixed_nk_bias_fp16.cpp`):  
+  Sets up input matrices for each group, configures GEMM and bias parameters, launches the grouped kernel, and verifies the result.
+- **Grouped GEMM kernel invocation**:  
+  Uses the Composable Kernel device API to launch grouped GEMM with bias addition.
+
+---
+
+## Additional Details
+
+- Supports multiple groups with different matrix shapes.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [15_grouped_gemm](../../example/15_grouped_gemm/README.md): Grouped GEMM in the main example directory
+- [11_convnd_fwd_bias](../../example/11_convnd_fwd_bias/README.md): Convolution with bias fusion
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/22_grouped_gemm/README.md
+++ b/client_example/22_grouped_gemm/README.md
@@ -0,0 +1,76 @@
+# Client Example: Grouped GEMM (Multiple Data Types)
+
+## Theory
+
+This client example demonstrates **grouped GEMM** for multiple data types (FP16, BF16, FP8, INT8). Grouped GEMM performs multiple independent GEMM operations (with potentially different shapes) in a single kernel launch, which is useful for transformer models, mixture-of-experts, and variable-length sequence processing.
+
+**Mathematical Formulation:**
+For $G$ groups, each with its own $A_g$, $B_g$:
+- GEMM: $Y_g = A_g \times B_g$
+
+**Algorithmic Background:**
+- Each group can have different matrix sizes and strides.
+- The kernel launches a grid covering all groups, with each block assigned to a group.
+- Supports multiple data types for flexibility and performance tuning.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/22_grouped_gemm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (FP16)
+./grouped_gemm_fixed_nk_fp16
+
+# Example run (BF16)
+./grouped_gemm_fixed_nk_bf16
+
+# Example run (FP8)
+./grouped_gemm_fixed_nk_fp8
+
+# Example run (INT8)
+./grouped_gemm_fixed_nk_i8
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/22_grouped_gemm/
+├── grouped_gemm_fixed_nk_fp16.cpp         # Grouped GEMM (FP16)
+├── grouped_gemm_fixed_nk_bf16.cpp         # Grouped GEMM (BF16)
+├── grouped_gemm_fixed_nk_fp8.cpp          # Grouped GEMM (FP8)
+├── grouped_gemm_fixed_nk_i8.cpp           # Grouped GEMM (INT8)
+├── CMakeLists.txt                         # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input matrices for each group, configures GEMM parameters, launches the grouped kernel, and verifies the result.
+- **Grouped GEMM kernel invocation**:  
+  Uses the Composable Kernel device API to launch grouped GEMM for different data types.
+
+---
+
+## Additional Details
+
+- Supports multiple groups with different matrix shapes and data types.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [15_grouped_gemm](../../example/15_grouped_gemm/README.md): Grouped GEMM in the main example directory
+- [17_grouped_gemm_fastgelu](../17_grouped_gemm_fastgelu/README.md): Grouped GEMM with FastGELU activation
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/23_elementwise_transpose/README.md
+++ b/client_example/23_elementwise_transpose/README.md
@@ -0,0 +1,64 @@
+# Client Example: Elementwise Operation with 3D Transpose
+
+## Theory
+
+This client example demonstrates **elementwise operations fused with 3D tensor transpose**. This pattern is used in deep learning for applying activation functions or scaling while simultaneously reordering tensor dimensions (e.g., for layout conversion or attention head reshaping).
+
+**Mathematical Formulation:**
+- Elementwise: $Z = f(X)$ or $Z = f(X_0, X_1)$
+- Transpose: $Y_{i_0, i_1, i_2} = Z_{i_{\pi(0)}, i_{\pi(1)}, i_{\pi(2)}}$
+  - $\pi$ is a permutation of the axes.
+
+**Algorithmic Background:**
+- The elementwise operation and transpose are fused in a single kernel.
+- Intermediate results are kept in registers, not written to global memory.
+- Used for layout conversion with activation, attention head reshaping, and more.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/23_elementwise_transpose
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (elementwise + 3D transpose)
+./elementwise_transpose_3d
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/23_elementwise_transpose/
+├── elementwise_transpose_3d.cpp         # Main client example: elementwise + 3D transpose
+├── CMakeLists.txt                       # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in `elementwise_transpose_3d.cpp`):  
+  Sets up input tensors, configures elementwise and transpose parameters, launches the fused kernel, and verifies the result.
+- **Fused kernel invocation**:  
+  Uses the Composable Kernel device API to launch the elementwise+transpose operation.
+
+---
+
+## Additional Details
+
+- Supports fusion of elementwise operations with 3D transpose.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [44_elementwise_permute](../../example/44_elementwise_permute/README.md): Elementwise operation with permutation in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/24_grouped_conv_activation/README.md
+++ b/client_example/24_grouped_conv_activation/README.md
@@ -0,0 +1,88 @@
+# Client Example: Grouped Convolution with Activation and Fusion
+
+## Theory
+
+This client example demonstrates **grouped convolution fused with various activation and elementwise operations**. Grouped convolution splits the input and weights into groups and applies convolution independently to each group, while fusion with activation and scaling improves efficiency.
+
+**Mathematical Formulation:**
+For each group $g$:
+- Convolution: $Y^g = \text{Conv}(X^g, W^g)$
+- Fused operations: $E^g = f(Y^g, D_0^g, D_1^g, ...)$
+  - $f$ can be bilinear, scale, add, relu, etc.
+
+**Algorithmic Background:**
+- Grouped convolution is used in efficient CNNs, depthwise separable convolutions, and expert models.
+- Fused epilogue operations (scale, add, relu, reduce) are performed in registers before writing to memory.
+- Supports 1D, 2D, and 3D grouped convolutions and a variety of fusion patterns.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/24_grouped_conv_activation
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (grouped conv + scale)
+./grouped_convnd_fwd_scale/grouped_convnd_fwd_scale
+
+# Example run (grouped conv + bilinear)
+./grouped_convnd_fwd_bilinear/grouped_convnd_fwd_bilinear
+
+# Example run (grouped conv + scale + relu)
+./grouped_convnd_fwd_convscale_relu/grouped_convnd_fwd_convscale_relu
+
+# Example run (grouped conv + scale + add + relu)
+./grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_convnd_fwd_scaleadd_scaleadd_relu
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/24_grouped_conv_activation/
+├── grouped_convnd_fwd_scale/                  # Grouped conv + scale
+├── grouped_convnd_fwd_bilinear/               # Grouped conv + bilinear
+├── grouped_convnd_fwd_convscale/              # Grouped conv + scale (convscale)
+├── grouped_convnd_fwd_convscale_add/          # Grouped conv + scale + add
+├── grouped_convnd_fwd_convscale_reduce/       # Grouped conv + scale + reduce
+├── grouped_convnd_fwd_convscale_relu/         # Grouped conv + scale + relu
+├── grouped_convnd_fwd_convinvscale/           # Grouped conv + inverse scale
+├── grouped_convnd_fwd_scaleadd_ab/            # Grouped conv + scale + add (A/B)
+├── grouped_convnd_fwd_scaleadd_scaleadd_relu/ # Grouped conv + scale + add + relu
+├── grouped_convnd_bwd_data_bilinear/          # Grouped conv bwd data + bilinear
+├── grouped_convnd_bwd_data_scale/             # Grouped conv bwd data + scale
+├── grouped_convnd_bwd_weight_bilinear/        # Grouped conv bwd weight + bilinear
+├── grouped_convnd_bwd_weight_scale/           # Grouped conv bwd weight + scale
+├── CMakeLists.txt                             # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each subdirectory's `.cpp`):  
+  Sets up input tensors, configures grouped convolution and fusion parameters, launches the kernel, and verifies the result.
+- **Grouped convolution kernel invocation**:  
+  Uses the Composable Kernel device API to launch grouped convolution with various fused epilogue operations.
+
+---
+
+## Additional Details
+
+- Supports a wide range of fusion patterns (bilinear, scale, add, relu, reduce, etc.).
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [10_grouped_convnd_bwd_data](../10_grouped_convnd_bwd_data/README.md): Grouped convolution backward data
+- [11_grouped_conv_bwd_weight](../11_grouped_conv_bwd_weight/README.md): Grouped convolution backward weight
+- [30_grouped_conv_fwd_multiple_d](../../example/30_grouped_conv_fwd_multiple_d/README.md): Grouped convolution forward with multiple D
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/25_wrapper/README.md
+++ b/client_example/25_wrapper/README.md
@@ -1,13 +1,70 @@
 [Back to the main page](../../README.md)
-# Composable Kernel wrapper GEMM tutorial

-This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK) wrapper. We present the base version of GEMM without most of the available optimizations; however, it's worth noting that CK has kernels with different optimizations.
+# Composable Kernel Wrapper GEMM Tutorial

-To implement these optimizations, you can use the CK wrapper or directly use available instances in CK. You can also refer to the [optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp), that uses CK wrapper based on the [`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.
+This tutorial demonstrates how to implement matrix multiplication (GEMM) using the Composable Kernel wrapper. The examples show both basic and optimized GEMM implementations, as well as how to use the wrapper for tensor transformations such as im2col.

-The kernel definition should look similar to:
+---
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/25_wrapper
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (basic GEMM)
+./wrapper_basic_gemm
+
+# Example run (optimized GEMM)
+./wrapper_optimized_gemm
+
+# Example run (im2col transformation)
+./wrapper_img2col
+
+# Example run (tensor transform using wrapper)
+./tensor_transform_using_wrapper
+```
+
+---
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/25_wrapper/
+├── wrapper_basic_gemm.cpp         # Basic GEMM using CK wrapper
+├── wrapper_optimized_gemm.cpp     # Optimized GEMM using CK wrapper
+├── wrapper_img2col.cpp            # im2col transformation using CK wrapper
+├── tensor_transform_using_wrapper.cpp # General tensor transform example
+├── CMakeLists.txt                 # Build configuration for the example
+├── README.md                      # This tutorial and reference
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input tensors, configures wrapper parameters, launches the kernel, and verifies the result.
+- **CK wrapper API usage**:  
+  Demonstrates how to create layouts, tensors, and launch GEMM or tensor transforms using the wrapper.
+
+---
+
+## Additional Details
+
+### Overview
+
+The CK wrapper provides a flexible interface for launching GEMM kernels and tensor operations. This tutorial presents:
+- A base GEMM implementation (minimal optimizations)
+- An optimized GEMM using `gridwise_gemm_xdlops_v2r3`
+- Examples of tensor transformations (e.g., im2col)

-```cpp
 template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
@@ -168,5 +225,13 @@ The end result from `c_vgpr_reg` is stored in the `C` local partition (tensor pe
    ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
 ```

-If you want to dive deep into the details, you can find the entire example
-[here](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_basic_gemm.cpp).
+---
+
+## Related Examples
+
+- [01_gemm](../01_gemm/README.md): Basic GEMM client example
+- [27_im2col_col2im](../27_im2col_col2im/README.md): im2col/col2im transformations
+- [25_gemm_bias_e_permute](../../example/25_gemm_bias_e_permute/README.md): GEMM with bias and permutation in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/26_reduce/CMakeLists.txt
+++ b/client_example/26_reduce/CMakeLists.txt
--- a/client_example/26_reduce/README.md
+++ b/client_example/26_reduce/README.md
@@ -0,0 +1,64 @@
+# Client Example: Parallel Reduction (NHWC)
+
+## Theory
+
+This client example demonstrates **parallel reduction operations** over NHWC tensors. Reduction is a fundamental operation in deep learning for computing statistics (such as batch mean/variance), loss aggregation, and normalization.
+
+**Mathematical Formulation:**
+Given a tensor $X[N, H, W, C]$ and a reduction axis (e.g., channel $C$):
+- **Sum**: $Y_{n,h,w} = \sum_c X_{n,h,w,c}$
+- **Max**: $Y_{n,h,w} = \max_c X_{n,h,w,c}$
+- **Mean**: $Y_{n,h,w} = \frac{1}{C} \sum_c X_{n,h,w,c}$
+
+**Algorithmic Background:**
+- Reductions are implemented using parallel tree or segmented reduction algorithms.
+- Efficient reductions require careful memory access, synchronization, and sometimes numerically stable algorithms.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/26_reduce
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (reduce over channel dimension)
+./reduce_nhwc_c
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/26_reduce/
+├── reduce_nhwc_c.cpp         # Main client example: reduction over NHWC tensors (channel axis)
+├── CMakeLists.txt            # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in `reduce_nhwc_c.cpp`):  
+  Sets up input tensors, configures reduction parameters, launches the reduction kernel, and verifies the result.
+- **Reduction kernel invocation**:  
+  Uses the Composable Kernel device API to launch the reduction operation.
+
+---
+
+## Additional Details
+
+- Supports sum, max, mean, and other reductions over NHWC tensors.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [12_reduce](../../example/12_reduce/README.md): Parallel reduction in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/26_reduce/reduce_nhwc_c.cpp
+++ b/client_example/26_reduce/reduce_nhwc_c.cpp
--- a/client_example/27_im2col_col2im/CMakeLists.txt
+++ b/client_example/27_im2col_col2im/CMakeLists.txt
--- a/client_example/27_im2col_col2im/README.md
+++ b/client_example/27_im2col_col2im/README.md
@@ -0,0 +1,68 @@
+# Client Example: im2col and col2im Transformations
+
+## Theory
+
+This client example demonstrates **im2col (image-to-column) and col2im (column-to-image) transformations**. These operations are used to convert image data into a matrix form suitable for GEMM-based convolution and to reconstruct images from column representations.
+
+**Mathematical Formulation:**
+- **im2col**: Rearranges image blocks into columns, mapping a 3D/4D tensor to a 2D matrix.
+- **col2im**: Reverses the process, mapping a 2D matrix back to an image tensor.
+
+**Algorithmic Background:**
+- im2col is used to lower convolution to matrix multiplication (GEMM).
+- col2im is used to reconstruct the original image or feature map from the column representation.
+- These transformations are essential for efficient convolution implementations on GPUs.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/27_im2col_col2im
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (image to column)
+./image_to_column
+
+# Example run (column to image)
+./column_to_image
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/27_im2col_col2im/
+├── image_to_column.cpp         # im2col: image to column transformation
+├── column_to_image.cpp         # col2im: column to image transformation
+├── CMakeLists.txt              # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input tensors, configures transformation parameters, launches the im2col or col2im kernel, and verifies the result.
+- **im2col/col2im kernel invocation**:  
+  Uses the Composable Kernel device API to launch the transformation.
+
+---
+
+## Additional Details
+
+- Supports various image and patch sizes.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [52_im2col_col2im](../../example/52_im2col_col2im/README.md): im2col/col2im in the main example directory
+- [09_convnd_fwd](../../example/09_convnd_fwd/README.md): N-dimensional convolution using im2col
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/27_im2col_col2im/column_to_image.cpp
+++ b/client_example/27_im2col_col2im/column_to_image.cpp
--- a/client_example/27_im2col_col2im/image_to_column.cpp
+++ b/client_example/27_im2col_col2im/image_to_column.cpp
--- a/client_example/28_gemm_mx/CMakeLists.txt
+++ b/client_example/28_gemm_mx/CMakeLists.txt
--- a/client_example/28_gemm_mx/README.md
+++ b/client_example/28_gemm_mx/README.md
@@ -0,0 +1,34 @@
+# Client Example: GEMM pipeline for microscaling (MX)
+
+## How to Run
+
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+```bash
+cd composable_kernel/build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -D DTYPES="fp8" ..
+make -j
+make install
+```
+
+### Build and run
+```bash
+/opt/rocm/bin/hipcc gemm_mx_fp8.cpp -o gemm_mx_fp8
+
+# Example run
+./gemm_mx_fp8
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/28_gemm_mx/
+├── gemm_mx_fp8.cpp       # GEMM MX (fp8)
+├── CMakeLists.txt        # Build configuration for the example
+```
+---
+[Back to Client Examples](../README.md)
--- a/client_example/28_gemm_mx/gemm_mx_fp8.cpp
+++ b/client_example/28_gemm_mx/gemm_mx_fp8.cpp
--- a/client_example/29_gemm_add_multiply/CMakeLists.txt
+++ b/client_example/29_gemm_add_multiply/CMakeLists.txt
--- a/client_example/29_gemm_add_multiply/README.md
+++ b/client_example/29_gemm_add_multiply/README.md
@@ -0,0 +1,66 @@
+# Client Example: GEMM with Add and Multiply Fusion
+
+## Theory
+
+This client example demonstrates **GEMM fused with addition and multiplication operations**. This pattern is used in neural networks for bias addition, scaling, gating, and other elementwise transformations after a linear layer.
+
+**Mathematical Formulation:**
+- GEMM: $Y = A \times B$
+- Add: $Z = Y + D_0$
+- Multiply: $E = Z \odot D_1$
+  - $D_0$, $D_1$: auxiliary tensors (e.g., bias, scale, gate)
+
+**Algorithmic Background:**
+- The GEMM result is kept in registers; addition and multiplication are fused in the epilogue.
+- No intermediate results are written to global memory.
+- Used for bias+scale, gating, and other fused epilogue patterns.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/client_example/29_gemm_add_multiply
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./gemm_add_multiply
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/29_gemm_add_multiply/
+├── gemm_add_multiply.cpp         # Main client example: GEMM+Add+Multiply
+├── CMakeLists.txt                # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in `gemm_add_multiply.cpp`):  
+  Sets up input matrices, configures GEMM and epilogue parameters, launches the fused kernel, and verifies the result.
+- **Fused kernel invocation**:  
+  Uses the Composable Kernel device API to launch the GEMM with fused addition and multiplication.
+
+---
+
+## Additional Details
+
+- Supports fusion of multiple elementwise operations with GEMM.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [02_gemm_bilinear](../../example/02_gemm_bilinear/README.md): Multi-tensor bilinear operations
+- [46_gemm_add_multiply](../../example/46_gemm_add_multiply/README.md): GEMM with add and multiply in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/29_gemm_add_multiply/gemm_add_multiply.cpp
+++ b/client_example/29_gemm_add_multiply/gemm_add_multiply.cpp
--- a/client_example/30_gemm_bf16Aint8B/README.md
+++ b/client_example/30_gemm_bf16Aint8B/README.md
@@ -0,0 +1,92 @@
+# Client Example: GEMM with bf16A/int8B and Fused Epilogues
+
+## Theory
+
+This client example demonstrates **GEMM with mixed-precision input types (bf16 for A, int8 for B)** and various fused epilogue operations (bias, GELU, FastGELU, multiply). Mixed-precision GEMM is used for efficient inference and training in deep learning, especially for transformer and MLP layers.
+
+**Mathematical Formulation:**
+- GEMM: $Y = A \times B$
+  - $A$: bf16 (brain floating point)
+  - $B$: int8 (8-bit integer)
+- Fused epilogues:
+  - Bias: $Z = Y + \text{bias}$
+  - GELU: $E = \text{GELU}(Z)$
+  - FastGELU: $E = \text{FastGELU}(Z)$
+  - Multiply: $E = Z \odot D_1$
+
+**Algorithmic Background:**
+- Mixed-precision computation reduces memory and compute requirements.
+- Fused epilogues improve efficiency by combining bias, activation, and scaling in a single kernel.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+```bash
+cd composable_kernel/build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -D DTYPES="bf16;int8" ..
+make -j
+make install
+```
+
+### Build and run
+```bash
+cd composable_kernel/client_example/30_gemm_bf16Aint8B
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (basic GEMM)
+./gemm_xdl_bf16_i8
+
+# Example run (GEMM + bias)
+./gemm_bias_xdl_bf16_i8
+
+# Example run (GEMM + GELU)
+./gemm_xdl_gelu_bf16_i8
+
+# Example run (GEMM + bias + FastGELU)
+./gemm_bias_fastgelu_xdl_bf16_i8
+
+# Example run (GEMM + multiply)
+./gemm_xdl_multiply_bf16_i8
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/30_gemm_bf16Aint8B/
+├── gemm_xdl_bf16_i8.cpp                # GEMM (bf16A, int8B)
+├── gemm_bias_xdl_bf16_i8.cpp           # GEMM + bias
+├── gemm_xdl_gelu_bf16_i8.cpp           # GEMM + GELU
+├── gemm_bias_fastgelu_xdl_bf16_i8.cpp  # GEMM + bias + FastGELU
+├── gemm_xdl_multiply_bf16_i8.cpp       # GEMM + multiply
+├── CMakeLists.txt                      # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input matrices, configures GEMM and epilogue parameters, launches the kernel, and verifies the result.
+- **Fused kernel invocation**:  
+  Uses the Composable Kernel device API to launch GEMM with various fused epilogues.
+
+---
+
+## Additional Details
+
+- Supports bf16 and int8 input types for efficient mixed-precision computation.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [14_gemm_quantization](../../example/14_gemm_quantization/README.md): GEMM quantization in the main example directory
+- [46_gemm_add_multiply](../../example/46_gemm_add_multiply/README.md): GEMM with add and multiply in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/client_example/31_grouped_gemm_bf16Aint8B/README.md
+++ b/client_example/31_grouped_gemm_bf16Aint8B/README.md
@@ -0,0 +1,93 @@
+# Client Example: Grouped GEMM with bf16A/int8B and Fused Epilogues
+
+## Theory
+
+This client example demonstrates **grouped GEMM with mixed-precision input types (bf16 for A, int8 for B)** and various fused epilogue operations (bias, FastGELU, multiply). Grouped GEMM performs multiple independent GEMM operations (with potentially different shapes) in a single kernel launch, and mixed-precision is used for efficient inference and training.
+
+**Mathematical Formulation:**
+For $G$ groups, each with its own $A_g$, $B_g$:
+- GEMM: $Y_g = A_g \times B_g$
+  - $A_g$: bf16 (brain floating point)
+  - $B_g$: int8 (8-bit integer)
+- Fused epilogues:
+  - Bias: $Z_g = Y_g + \text{bias}_g$
+  - FastGELU: $E_g = \text{FastGELU}(Z_g)$
+  - Multiply: $E_g = Z_g \odot D_{1,g}$
+
+**Algorithmic Background:**
+- Each group can have different matrix sizes and strides.
+- Mixed-precision computation reduces memory and compute requirements.
+- Fused epilogues improve efficiency by combining bias, activation, and scaling in a single kernel.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+```bash
+cd composable_kernel/build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -D DTYPES="bf16;int8" ..
+make -j
+make install
+```
+
+### Build and run
+```bash
+cd composable_kernel/client_example/31_grouped_gemm_bf16Aint8B
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (basic grouped GEMM)
+./grouped_gemm_xdl_bf16_i8
+
+# Example run (grouped GEMM + bias + FastGELU)
+./grouped_gemm_bias_fastgelu_xdl_bf16_i8
+
+# Example run (grouped GEMM + FastGELU)
+./grouped_gemm_fastgelu_xdl_bf16_i8
+
+# Example run (grouped GEMM + multiply)
+./grouped_gemm_multiply_xdl_bf16_i8
+
+# Example run (grouped GEMM + multiply + bias + FastGELU)
+./grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+client_example/31_grouped_gemm_bf16Aint8B/
+├── grouped_gemm_xdl_bf16_i8.cpp                # Grouped GEMM (bf16A, int8B)
+├── grouped_gemm_bias_fastgelu_xdl_bf16_i8.cpp  # Grouped GEMM + bias + FastGELU
+├── grouped_gemm_fastgelu_xdl_bf16_i8.cpp       # Grouped GEMM + FastGELU
+├── grouped_gemm_multiply_xdl_bf16_i8.cpp       # Grouped GEMM + multiply
+├── grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp # Grouped GEMM + multiply + bias + FastGELU
+├── CMakeLists.txt                              # Build configuration for the example
+```
+
+### Key Functions
+
+- **main()** (in each `.cpp`):  
+  Sets up input matrices for each group, configures GEMM and epilogue parameters, launches the grouped kernel, and verifies the result.
+- **Grouped GEMM kernel invocation**:  
+  Uses the Composable Kernel device API to launch grouped GEMM with various fused epilogues.
+
+---
+
+## Additional Details
+
+- Supports multiple groups with different matrix shapes and bf16/int8 input types.
+- Example parameters can be adjusted in the source for different workloads.
+
+---
+
+## Related Examples
+
+- [30_gemm_bf16Aint8B](../30_gemm_bf16Aint8B/README.md): GEMM with bf16A/int8B and fused epilogues
+- [15_grouped_gemm](../../example/15_grouped_gemm/README.md): Grouped GEMM in the main example directory
+
+---
+[Back to Client Examples](../README.md)
--- a/cmake/EnableCompilerWarnings.cmake
+++ b/cmake/EnableCompilerWarnings.cmake
@@ -99,6 +99,9 @@ else()
                -Wno-unused-lambda-capture
                -Wno-nvcc-compat
            )
+            if(CK_CXX_STANDARD GREATER_EQUAL 20)
+                list(APPEND CMAKE_COMPILER_WARNINGS -Wno-c++20-compat)
+            endif()
        else()
            if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "GNU" AND ${COMPILER} MATCHES "CXX")
                # cmake 3.5.2 does not support >=.
--- a/cmake/gtest.cmake
+++ b/cmake/gtest.cmake
@@ -1,3 +1,4 @@
+include_guard(GLOBAL)
 include(FetchContent)

 set(GOOGLETEST_DIR "" CACHE STRING "Location of local GoogleTest repo to build against")
@@ -12,6 +13,17 @@ FetchContent_Declare(
    GIT_TAG f8d7d77c06936315286eb55f8de22cd23c188571
 )

+FetchContent_Populate(GTest)
+
+# Patch googlemock/CMakeLists.txt to fix invalid include path
+set(GMOCK_CMAKE "${gtest_SOURCE_DIR}/googlemock/CMakeLists.txt")
+file(READ "${GMOCK_CMAKE}" GMOCK_CMAKE_CONTENT)
+string(REPLACE [[gtest_SOURCE_DIR}/include]]
+               [[gtest_SOURCE_DIR}/googletest/include]]
+               GMOCK_CMAKE_CONTENT
+               "${GMOCK_CMAKE_CONTENT}")
+file(WRITE "${GMOCK_CMAKE}" "${GMOCK_CMAKE_CONTENT}")
+
 # Suppress ROCMChecks WARNING on GoogleTests
 set(ROCM_DISABLE_CHECKS FALSE)
 macro(rocm_check_toolchain_var var access value list_file)
@@ -24,7 +36,7 @@ if(WIN32)
    set(gtest_force_shared_crt ON CACHE_INTERNAL "")
 endif()

-set(BUILD_GMOCK OFF CACHE INTERNAL "")
+set(BUILD_GMOCK ON CACHE INTERNAL "")
 set(INSTALL_GTEST OFF CACHE INTERNAL "")

 # Store the current value of BUILD_SHARED_LIBS
@@ -32,15 +44,12 @@ set(__build_shared_libs ${BUILD_SHARED_LIBS})
 set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "")

 set(ROCM_DISABLE_CHECKS TRUE)
-FetchContent_MakeAvailable(GTest)
+add_subdirectory(${gtest_SOURCE_DIR} ${gtest_BINARY_DIR})
 set(ROCM_DISABLE_CHECKS FALSE)

 # Restore the old value of BUILD_SHARED_LIBS
 set(BUILD_SHARED_LIBS ${__build_shared_libs} CACHE BOOL "Type of libraries to build" FORCE)

-set(BUILD_GMOCK OFF CACHE INTERNAL "")
-set(INSTALL_GTEST OFF CACHE INTERNAL "")
-
 set(GTEST_CXX_FLAGS
     -Wno-undef
     -Wno-reserved-identifier
@@ -71,3 +80,12 @@ target_compile_options(gtest_main PRIVATE ${GTEST_CXX_FLAGS})
 target_compile_definitions(gtest PRIVATE GTEST_HAS_SEH=0)
 target_compile_definitions(gtest_main PRIVATE GTEST_HAS_SEH=0)

+if(TARGET gmock)
+    target_compile_options(gmock PRIVATE ${GTEST_CXX_FLAGS})
+    target_compile_definitions(gmock PRIVATE GTEST_HAS_SEH=0)
+endif()
+
+if(TARGET gmock_main)
+    target_compile_options(gmock_main PRIVATE ${GTEST_CXX_FLAGS})
+    target_compile_definitions(gmock_main PRIVATE GTEST_HAS_SEH=0)
+endif()
--- a/example/01_gemm/README.md
+++ b/example/01_gemm/README.md
@@ -1,27 +1,221 @@
-# Instructions for ```example_gemm_xdl```
+[Back to supported operations](../../include/ck/README.md)
+# Composable Kernel GEMM Example
+
+## Introduction
+
+GEMM (General Matrix Multiplication) is a fundamental operation in linear algebra and deep learning. It computes the product of two matrices, optionally adds a bias or residual, and is the core of many neural network layers (MLPs, attention, convolutions via im2col). This example demonstrates the flexible and high-performance GEMM API provided by Composable Kernel.
+
+---
+
+## Theory
+
+**Mathematical Formulation:**
+$$
+C = \alpha (A \times B) + \beta D
+$$
+- $A$: [M, K] input matrix
+- $B$: [K, N] weight matrix
+- $D$: [M, N] optional bias/residual
+- $C$: [M, N] output
+- $\alpha, \beta$: scalars (often 1.0, 0.0)
+
+GEMM is implemented using a tiled/blocking strategy to maximize data reuse and memory bandwidth. Modern GPU implementations use matrix core/XDL/MFMA instructions for high throughput. The operation is the computational backbone for transformer attention, MLPs, CNNs (via lowering), and more.
+
+---
+
+## CK GEMM API Overview
+
+CK provides a highly composable GEMM API via the `DeviceGemm` family of device operations. These are highly templated to support a wide range of data types, layouts, and fused operations.
+
+### Template Parameters
+
+- **ALayout** - A matrix layout (RowMajor/ColumnMajor)
+- **BLayout** - B matrix layout (RowMajor/ColumnMajor)
+- **CLayout** - C matrix layout (RowMajor/ColumnMajor)
+- **ADataType** - A matrix data type
+- **BDataType** - B matrix data type
+- **CDataType** - C matrix data type
+- **AElementwiseOperation** - Fused operation on tensor A before GEMM
+- **BElementwiseOperation** - Fused operation on tensor B before GEMM
+- **CElementwiseOperation** - Fused operation on tensor C after GEMM
+
+For a large K dimension, use `DeviceGemmSplitK` to split K across workgroups (the output buffer must be zeroed beforehand because the results are accumulated with AtomicAdd).
+
+For fused operations with additional tensors, use `DeviceGemmMultipleABD` or `DeviceGemmMultipleD`:
+- **DsLayout** - layouts for additional tensors
+- **DsDataType** - data types for additional tensors
+
+For `DeviceGemmMultipleABD`, pass **ALayout**, **BLayout**, **ADataType**, **BDataType** as tuples.
+
+---
+
+## Supported GEMM Variants
+
+- **DeviceGemm**: Standard GEMM
+- **DeviceGemmSplitK**: Split-K GEMM for large K
+- **DeviceGemmMultipleABD**: Fused GEMM with multiple A/B/D tensors
+- **DeviceGemmMultipleD**: Fused GEMM with multiple D tensors
+
+---
+
+## Supported Device Operations
+
+- **DeviceGemmDl**: DL instructions
+- **DeviceGemmDpp**: DL instructions with DPP during data load
+- **DeviceGemmWmma_CShuffle**: WMMA instructions with CShuffle optimization
+- **DeviceGemm_Xdl_CShuffle_LdsDirectLoad**: XDL instructions, CShuffle, direct global-to-shared load
+- **DeviceGemm_Xdl_CShuffle**: XDL instructions with CShuffle
+- **DeviceGemm_Xdl_CShuffleV2**: XDL instructions, optimized pipeline vs. V1
+- **DeviceGemmXdlSkipBLds**: XDL, skips shared memory load for B
+- **DeviceGemm_Xdl_WaveletModel_CShuffle**: XDL, CShuffle, wavelet producer/consumer
+- **DeviceGemmXdl**: XDL instructions
+
+---
+
+## Supported Data Types and Layouts
+
+### XDL Instruction
+
+|       |Is supported|
+|-------|---|
+|bf16   |✔️|
+|fp16   |✔️|
+|fp32   |✔️|
+|int8   |✔️|
+|fp8    |✔️|
+
+### WMMA Instruction
+
+|       |Is supported|
+|-------|---|
+|bf16   |✔️|
+|fp16   |✔️|
+|fp32   |❌|
+|int8   |✔️|
+|fp8    |❌|
+
+### DL Instruction
+
+|       |Is supported|
+|-------|---|
+|bf16   |❌|
+|fp16   |✔️|
+|fp32   |✔️|
+|int8   |✔️|
+|fp8    |❌|
+
+---
+
+## Supported Fused Elementwise Operations
+
+- **B Matrix Multiply + Add + Gelu** - bf16 (int8 for B matrix)
+- **B Matrix Multiply + Add** - bf16 (int8 for B matrix)
+- **B Matrix Multiply + Gelu** - bf16 (int8 for B matrix)
+- **B Matrix Multiply** - bf16 (int8 for B matrix)
+- **Add + Add + Gelu** - fp16
+- **Add + Gelu** - fp16, bf16 (int8 for B matrix) for Row/Column/Row
+- **Multiply** - fp16
+- **Add + Multiply** - fp16
+- **Add + Relu** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
+- **Add + Silu** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
+- **Add** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
+- **Bilinear** - fp16, int8
+- **Gelu** - fp16
+- **Multiply + Add** - fp16 for Row/Column/Row and Row/Row/Row, fp16 (int8 for B matrix, fp32 for Bias) for Row/Column/Row and Row/Row/Row
+- **Quantization** - int8
+
+---
+
+## GEMM V2 (Universal GEMM)
+
+Optimized for the MI300 series. The operation is called `DeviceGemmV2` and uses template parameters similar to those above.
+
+- **ALayout**, **BLayout**, **CLayout**
+- **ADataType**, **BDataType**, **CDataType**
+- **AElementwiseOperation**, **BElementwiseOperation**, **CElementwiseOperation**
+
+Split-K is supported (requires zeroing output buffer if splitK > 1).
+
+### Device Operations
+
+- **DeviceGemm_Xdl_CShuffleV3**: XDL with CShuffle optimization
+- **DeviceGemm_Xdl_CShuffleV3R1**: XDL with CShuffle, reduction on split-K after GEMM
+
+### Supported Types
+
+|       |Is supported|
+|-------|---|
+|bf16   |✔️|
+|fp16   |✔️|
+|fp32   |❌|
+|int8   |❌|
+|fp8 (C bf16)|✔️|
+|fp16 (A fp8)|✔️|
+|fp16 (B fp8)|✔️|
+
+---
+
+## Other GEMM Extensions
+
+- **DeviceGemm_dequantB**: GEMM with dequantization (WMMA)
+- **DeviceGemmMultipleD_ABScale**: GEMM with scale for A and B
+- **DeviceGemmMultipleDLayernorm**: GEMM fused with layernorm
+- **DeviceGemmMultipleDMultipleR**: GEMM fused with reductions and custom global reductions
+- **DeviceGemmReduce**: GEMM fused with reduction
+- **DeviceGemm_Streamk_V2**: Stream K with reduction instead of AtomicAdd
+- **DeviceGemmStreamK**: Stream K using AtomicAdd
+
+---
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run

-## Run ```example_gemm_xdl```
 ```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: run kernel # of times (>1)
-./bin/example_gemm_xdl 0 1 5
+cd composable_kernel/example/01_gemm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (FP16)
+./gemm_xdl_fp16 -M 4096 -N 4096 -K 4096 -v 1 -t 1
 ```

-# Instructions for ```example_gemm_xdl_fp16_streamk_v3```
+---
+
+## Source Code Structure

-## Run ```example_gemm_xdl_fp16_streamk_v3```
-```bash
-arg1: verification (0=no, 1=yes)
-arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-arg3: time kernel (0=no, 1=yes)
-arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC
-arg10: stream-k select (-1: default config, 0: all DP, 1: 1-tile SK, 2: 2-tile SK)
-arg11: Grid_size(-1 for max occupancy)
-bin/example_gemm_xdl_fp16_streamk_v3 1 2 1 3840 4096 4096 4096 4096 4096 1 -1
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
-c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-problem {M:3840, N:4096, K:4096, SA:4096, SB:4096, SC:4096, MP:4032, NP:4096, KRead:4096, KP:4096, AK0:512, BK0:2048, MBlock: 18, NBlock: 16, Stream-K Selection:1, Grid size:-1}
-Perf: 0.292022 ms, 441.23 TFlops, 330.348 GB/s, DeviceGemmXdlUniversal<MNPadding, RRR> BlkSize: 256, BlkTile: 224x256x64, WaveTile: 16x16, WaveMap: 7x8, VmemReadVec: 8x8, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2
 ```
+example/01_gemm/
+├── gemm_xdl_fp16.cpp         # Main example: sets up, runs, and verifies GEMM (FP16)
+├── gemm_xdl_fp32.cpp         # Main example: FP32 variant
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm.hpp       # Device-level GEMM API (templated)
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_gemm_xdl.hpp   # XDL-based GEMM implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_gemm_xdl.hpp # Grid-level tiled GEMM kernel
+include/ck/tensor_operation/gpu/block/
+│   └── blockwise_gemm_xdl.hpp # Block-level tiled GEMM
+library/reference_tensor_operation/cpu/
+    └── reference_gemm.hpp    # CPU reference GEMM for correctness checking
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmXdl** (in `device_gemm_xdl.hpp`):  
+  Main device API for launching GEMM kernels.  
+- **GridwiseGemmXdl** (in `gridwise_gemm_xdl.hpp`):  
+  Implements the tiled/blocking GEMM kernel for the GPU grid.
+- **BlockwiseGemmXdl** (in `blockwise_gemm_xdl.hpp`):  
+  Handles block-level computation and shared memory tiling.
+- **reference_gemm** (in `reference_gemm.hpp`):  
+  CPU implementation for result verification.
+
+---
+
+This example is the foundation for all matrix operations in Composable Kernel and is the basis for more advanced fused and batched operations.
--- a/example/01_gemm/gemm_wmma_fp16_v3.cpp
+++ b/example/01_gemm/gemm_wmma_fp16_v3.cpp
@@ -26,17 +26,18 @@ using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuf
    ALayout, BLayout, CLayout,
    ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
    PassThrough, PassThrough, PassThrough, GemmDefault,
-    128,
-    128, 64,
-    64, 8, 8,
+    256,
+    128, 256, 64,
+    8, 8,
    16, 16,
-    4, 2,
-    S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
+    2, 8,
+    S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
    1, 1, 8, 1,
-    S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
+    S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
    1, 1, 8, 1,
-    1, 1, S<1, 32, 1, 4>, 8,
-    ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3>;
+    1, 1,
+    S<1, 64, 1, 4>, 8,
+    ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::
--- a/example/02_gemm_bilinear/README.md
+++ b/example/02_gemm_bilinear/README.md
@@ -1,6 +1,78 @@
-# Instructions for ```example_gemm_bilinear_xdl_fp16```
+# Composable Kernel GEMM Bilinear Example
+
+## Introduction
+
+This example demonstrates GEMM (General Matrix Multiplication) fused with bilinear operations on auxiliary tensors using Composable Kernel. Bilinear fusion patterns are widely used in neural networks for gating, attention, and multimodal feature fusion, where the output of a matrix multiplication is combined elementwise with one or more additional tensors.
+
+---
+
+## Theory
+
+**Mathematical Formulation:**
+$$
+F = \text{BilinearOp}(A \times B, D, E)
+$$
+- $A$: [M, K] input matrix
+- $B$: [K, N] weight matrix
+- $D$, $E$: [M, N] auxiliary tensors (or broadcastable)
+- $F$: [M, N] output
+
+**Examples:**
+- Elementwise: $F = (A \times B) \odot D \odot E$
+- Gated: $F = (A \times B) \odot \sigma(D) + E$
+- Weighted: $F = \alpha (A \times B) + \beta (D \odot E)$
+
+The GEMM result is kept in registers and combined with auxiliary tensors in the epilogue, avoiding intermediate writes to global memory. This pattern is common in attention, gating, and feature interaction layers.
+
+---
+
+## CK GEMM Bilinear API Overview
+
+CK provides a composable API for GEMM with multiple auxiliary tensors via the `DeviceGemmMultipleD` operation.
+
+### Template Parameters
+
+- **ALayout** - A matrix layout (RowMajor/ColumnMajor)
+- **BLayout** - B matrix layout (RowMajor/ColumnMajor)
+- **DsLayout** - Layouts for auxiliary tensors (tuple)
+- **ELayout** - Output matrix layout (RowMajor/ColumnMajor)
+- **ADataType** - A matrix data type
+- **BDataType** - B matrix data type
+- **DsDataType** - Data types for auxiliary tensors (tuple)
+- **EDataType** - Output matrix data type
+- **AElementwiseOperation** - Fused operation on tensor A before GEMM
+- **BElementwiseOperation** - Fused operation on tensor B before GEMM
+- **CDEElementwiseOperation** - Fused operation on C, D, E after GEMM
+
+### Supported Data Types and Layouts
+
+- Supports fp16, int8, and other types depending on the device operation.
+- Supports RowMajor and ColumnMajor layouts for all tensors.
+
+### Supported Device Operations
+
+- **DeviceGemmMultipleD**: Standard multi-tensor GEMM
+- **DeviceGemmMultipleD_Bilinear**: GEMM with bilinear fusion in the epilogue
+
+---
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+
+```bash
+cd composable_kernel/example/02_gemm_bilinear
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+```
+### Run ```example_gemm_bilinear_xdl_fp16```

-## Run ```example_gemm_bilinear_xdl_fp16```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
@@ -9,3 +81,35 @@
 #arg11 to 12: alpha, beta
 ./bin/example_gemm_bilinear_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096 4096 0.5 0.5
 ```
+
+---
+
+## Source Code Structure
+
+```
+example/02_gemm_bilinear/
+├── gemm_bilinear_xdl.cpp         # Main example: sets up, runs, and verifies GEMM with bilinear fusion
+├── gemm_bilinear_wmma_fp16.cpp   # WMMA FP16 variant
+├── gemm_bilinear_wmma_int8.cpp   # WMMA int8 variant
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm_multiple_d.hpp       # Device-level API for multi-tensor GEMM
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_gemm_bilinear_impl.hpp    # Bilinear operation implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_gemm_multiple_d.hpp     # Grid-level multi-tensor GEMM kernel
+include/ck/tensor_operation/gpu/element/
+    └── element_wise_operation.hpp       # Elementwise operation definitions
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmMultipleD** (in `device_gemm_multiple_d.hpp`):  
+  Device API for GEMM with multiple auxiliary tensors and fused epilogues.
+- **gridwise_gemm_multiple_d** (in `gridwise_gemm_multiple_d.hpp`):  
+  Implements the tiled/blocking GEMM kernel with multi-tensor epilogue.
+- **element_wise_operation** (in `element_wise_operation.hpp`):  
+  Defines bilinear and other elementwise operations.
+
+---
+
+This example demonstrates how Composable Kernel supports complex multi-tensor fusion patterns for advanced neural network architectures.
--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
@@ -43,8 +43,9 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;

-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

@@ -190,11 +191,11 @@ int main(int argc, char* argv[])

            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };

--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp
@@ -43,8 +43,9 @@ using S = ck::Sequence<Is...>;
 using I8  = std::int8_t;
 using I32 = std::int32_t;

-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

@@ -190,11 +191,11 @@ int main(int argc, char* argv[])

            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };

--- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
@@ -42,8 +42,9 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;

-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

@@ -173,7 +174,7 @@ int main(int argc, char* argv[])
        printf("arg3: time kernel (0=no, 1=yes)\n");
        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
               "beta\n");
-        exit(0);
+        exit(1);
    }

    auto f_host_tensor_descriptor =
@@ -182,11 +183,11 @@ int main(int argc, char* argv[])

            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };

--- a/example/03_gemm_bias_relu/README.md
+++ b/example/03_gemm_bias_relu/README.md
@@ -1,10 +1,63 @@
-# Instructions for ```example_gemm_bias_relu_xdl_fp16```
+# GEMM with Bias and ReLU Activation Fusion

-## Run ```example_gemm_bias_relu_xdl_fp16```
+## Theory
+
+This example demonstrates **GEMM fused with bias addition and ReLU activation**. This is the core pattern for fully connected (dense) neural network layers and the feed-forward blocks in transformers.
+
+**Mathematical Formulation:**
+$$
+E = \text{ReLU}(A \times B + \text{bias})
+$$
+- $A$: [M, K] input matrix
+- $B$: [K, N] weight matrix
+- $\text{bias}$: [N] bias vector (broadcasted)
+- $E$: [M, N] output
+
+**Algorithmic Background:**
+- The GEMM result is kept in registers, bias is added, and ReLU is applied before writing to global memory.
+- This fusion eliminates intermediate memory traffic and is a standard optimization in deep learning frameworks.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
 ```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: time kernel (0=no, 1=yes)
-#arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE
-./bin/example_gemm_bias_relu_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096
+cd composable_kernel/example/03_gemm_bias_relu
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (args: verification, initialization, time kernel, M, N, K, StrideA, StrideB, StrideE)
+./bin/example_gemm_bias_relu_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096
 ```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/03_gemm_bias_relu/
+├── gemm_bias_relu_xdl_fp16.cpp    # Main example: sets up, runs, and verifies GEMM+Bias+ReLU
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm_multiple_d.hpp         # Device-level API for multi-tensor GEMM
+include/ck/tensor_operation/gpu/device/impl/
+│   ├── device_gemm_xdl_cshuffle_v3.hpp    # XDL with C-Shuffle epilogue
+│   └── device_gemm_bias_relu_impl.hpp     # Specialized bias+ReLU implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_gemm_xdl_cshuffle.hpp     # Grid-level GEMM with epilogue
+include/ck/tensor_operation/gpu/element/
+    └── element_wise_operation.hpp         # Elementwise operation definitions
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmMultipleD** (in `device_gemm_multiple_d.hpp`):  
+  Device API for GEMM with auxiliary tensors and fused epilogues.
+- **gridwise_gemm_xdl_cshuffle** (in `gridwise_gemm_xdl_cshuffle.hpp`):  
+  Implements the tiled/blocking GEMM kernel with fused epilogue.
+- **element_wise_operation** (in `element_wise_operation.hpp`):  
+  Defines bias addition and ReLU activation.
+
+This example demonstrates the standard epilogue fusion concept that enables efficient neural network layers in modern deep learning.
--- a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
+++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
@@ -25,8 +25,9 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;

-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

@@ -160,23 +161,22 @@ int main(int argc, char* argv[])

            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };

+    ck::index_t StrideD = 0;
+
    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, 0, ELayout{}));
+    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, ELayout{}));
    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));

-    const auto StrideD = std::is_same<decltype(ELayout{}), ck::tensor_layout::gemm::RowMajor>::value
-                             ? d_m_n.mDesc.GetStrides()[0]
-                             : d_m_n.mDesc.GetStrides()[1];
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
--- a/example/04_gemm_add_add_fastgelu/README.md
+++ b/example/04_gemm_add_add_fastgelu/README.md
@@ -1,10 +1,70 @@
-# Instructions for ```example_gemm_add_add_fastgelu_xdl_fp16```
+# GEMM with Add, Add, and FastGELU Activation

-## Run ```example_gemm_add_add_fastgelu_xdl_fp16```
+## Theory
+
+This example demonstrates a **GEMM operation fused with two addition operations and FastGELU activation**. This pattern is used in transformer feed-forward networks and other neural architectures where a linear transformation is followed by bias addition, residual addition, and a non-linear activation.
+
+**Mathematical Formulation:**
+$$
+E = \text{FastGELU}((A \times B) + D_0 + D_1)
+$$
+- $A$: [M, K] input matrix
+- $B$: [K, N] weight matrix
+- $D_0$: [N] bias vector (broadcasted)
+- $D_1$: [M, N] residual tensor
+- $E$: [M, N] output
+
+FastGELU is an efficient approximation of GELU:
+$$
+\text{FastGELU}(x) = x \cdot \sigma(1.702 \cdot x)
+$$
+where $\sigma$ is the sigmoid function.
+
+**Algorithmic Background:**
+- The GEMM result is kept in registers, bias and residual are added, and FastGELU is applied before writing to global memory.
+- No intermediate results are written to global memory.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
 ```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: time kernel (0=no, 1=yes)
-#arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE"
-./bin/example_gemm_add_add_fastgelu_xdl_fp16 1 1 1
+cd composable_kernel/example/04_gemm_add_add_fastgelu
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (args: verification, initialization, time kernel; optionally M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE)
+./bin/example_gemm_add_add_fastgelu_xdl_fp16 1 1 1
 ```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/04_gemm_add_add_fastgelu/
+├── gemm_add_add_fastgelu_xdl.cpp         # Main example: sets up, runs, and verifies GEMM+Add+Add+FastGELU
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm_multiple_d.hpp         # Device-level API for multi-tensor GEMM
+include/ck/tensor_operation/gpu/device/impl/
+│   ├── device_gemm_xdl_cshuffle_v3.hpp    # XDL with C-Shuffle epilogue
+│   └── device_gemm_fastgelu_impl.hpp      # FastGELU-specific implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_gemm_multiple_d_xdl.hpp   # Grid-level multi-stage GEMM
+include/ck/tensor_operation/gpu/element/
+    └── element_wise_operation.hpp         # Elementwise operation definitions
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmMultipleD** (in `device_gemm_multiple_d.hpp`):  
+  Device API for GEMM with multiple auxiliary tensors and fused epilogues.
+- **gridwise_gemm_multiple_d_xdl** (in `gridwise_gemm_multiple_d_xdl.hpp`):  
+  Implements the tiled/blocking GEMM kernel with multi-stage epilogue.
+- **element_wise_operation** (in `element_wise_operation.hpp`):  
+  Defines FastGELU and other elementwise operations.
+
+This example demonstrates how Composable Kernel supports complex multi-stage epilogue fusion for advanced neural network architectures.
--- a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+++ b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
@@ -6,6 +6,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
    static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
 #endif
    using namespace ck::literals;
+    using Bypass = ck::tensor_layout::BypassLayoutVerification;

    ProblemSize ps =
        problem_size; // make mutable copy because default stride values of 0 need to be updated
@@ -15,11 +16,11 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };

@@ -43,7 +44,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;

-    // If any user-provided leading stride <= 0, replace it with the one determined by the
+    // If any user-provided leading stride < 0, replace it with the one determined by the
    // created tensor descriptor. For RowMajor the leading stride is index 0, for ColMajor index 1.
    auto fetch_leading_stride = [](const auto& tensor, auto layout_tag) -> int {
        if constexpr(std::is_same_v<decltype(layout_tag), ck::tensor_layout::gemm::RowMajor>)
@@ -56,15 +57,15 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
        }
    };

-    if(StrideA <= 0)
+    if(StrideA < 0)
        StrideA = fetch_leading_stride(a_m_k, ALayout{});
-    if(StrideB <= 0)
+    if(StrideB < 0)
        StrideB = fetch_leading_stride(b_k_n, BLayout{});
-    if(StrideD0 <= 0)
+    if(StrideD0 < 0)
        StrideD0 = fetch_leading_stride(d0_m_n, D0Layout{});
-    if(StrideD1 <= 0)
+    if(StrideD1 < 0)
        StrideD1 = fetch_leading_stride(d1_m_n, D1Layout{});
-    if(StrideE <= 0)
+    if(StrideE < 0)
        StrideE = fetch_leading_stride(e_m_n_host_result, ELayout{});

    switch(config.init_method)
--- a/example/09_convnd_fwd/README.md
+++ b/example/09_convnd_fwd/README.md
@@ -1,6 +1,42 @@
-# Instructions for ```example_convnd_fwd_xdl```
+# N-Dimensional Convolution Forward
+
+## Theory
+
+This example demonstrates the **N-dimensional convolution forward pass** using Composable Kernel. Convolution is a fundamental operation in deep learning, especially in convolutional neural networks (CNNs) for images, audio, and volumetric data.
+
+**Mathematical Formulation:**
+Given:
+- Input tensor: $X[N, C_{in}, D_1, D_2, ..., D_n]$
+- Weight tensor: $W[C_{out}, C_{in}, K_1, K_2, ..., K_n]$
+- Output tensor: $Y[N, C_{out}, O_1, O_2, ..., O_n]$
+
+The convolution computes:
+$$
+Y[n, c_{out}, o_1, ..., o_n] = \sum_{c_{in}} \sum_{k_1} ... \sum_{k_n} X[n, c_{in}, o_1 + k_1, ..., o_n + k_n] \cdot W[c_{out}, c_{in}, k_1, ..., k_n]
+$$
+
+Stride, padding, and dilation parameters control the mapping between input and output indices.
+
+**Algorithmic Background:**
+- Composable Kernel implements convolution as an implicit GEMM (matrix multiplication) for efficiency.
+- The input and weight tensors are transformed into matrices, and the convolution is performed as a GEMM.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/09_convnd_fwd
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+```
+
+### Run ```example_convnd_fwd_xdl```

-## Run ```example_convnd_fwd_xdl```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
@@ -16,3 +52,29 @@
 # <right padding>, (ie RightPy, RightPx for 2D)
 ./bin/example_convnd_fwd_xdl 0 1 100
 ```
+## Source Code Structure
+
+### Directory Layout
+```
+example/09_convnd_fwd/
+├── convnd_fwd_xdl.cpp         # Main example: sets up, runs, and verifies N-D convolution
+include/ck/tensor_operation/gpu/device/
+│   └── device_convnd_fwd.hpp       # Device-level convolution API
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_convnd_fwd_xdl.hpp   # XDL-based convolution implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_convnd_fwd_xdl.hpp # Grid-level convolution kernel
+include/ck/tensor_operation/gpu/block/
+    └── blockwise_convnd_fwd_xdl.hpp # Block-level convolution
+```
+
+### Key Classes and Functions
+
+- **DeviceConvNdFwd** (in `device_convnd_fwd.hpp`):  
+  Device API for N-dimensional convolution.
+- **gridwise_convnd_fwd_xdl** (in `gridwise_convnd_fwd_xdl.hpp`):  
+  Implements the tiled/blocking convolution kernel.
+- **blockwise_convnd_fwd_xdl** (in `blockwise_convnd_fwd_xdl.hpp`):  
+  Handles block-level computation and shared memory tiling.
+
+This example demonstrates how Composable Kernel implements efficient N-dimensional convolution using implicit GEMM, supporting a wide range of deep learning applications.
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/README.md
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/README.md
@@ -0,0 +1,57 @@
+# N-Dimensional Convolution with Multiple D and Multiple Reduce
+
+## Theory
+
+This example demonstrates **N-dimensional convolution forward** with support for multiple auxiliary tensors (D) and multiple reduction operations. This is useful for advanced neural network layers that require additional outputs or statistics alongside the main convolution result.
+
+**Mathematical Formulation:**
+- Input tensor: $X[N, C_{in}, D_1, D_2, ..., D_n]$
+- Weight tensor: $W[C_{out}, C_{in}, K_1, K_2, ..., K_n]$
+- Auxiliary tensors: $D_0, D_1, ...$ (various shapes)
+- Output tensor: $Y[N, C_{out}, O_1, O_2, ..., O_n]$
+- Reduction operations: e.g., sum, mean, max over specified axes
+
+The convolution computes the standard output as well as additional outputs or statistics by applying reduction operations to the convolution result or auxiliary tensors.
+
+**Algorithmic Background:**
+- Composable Kernel implements this as an implicit GEMM with support for multiple auxiliary tensors and reductions in the epilogue.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./convnd_fwd_multiple_d_multiple_reduce_xdl --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/10_convnd_fwd_multiple_d_multiple_reduce/
+├── convnd_fwd_multiple_d_multiple_reduce_xdl.cpp   # Main example: sets up, runs, and verifies N-D convolution with multiple D/reduce
+include/ck/tensor_operation/gpu/device/
+│   └── device_convnd_fwd_multiple_d_multiple_reduce.hpp   # Device-level API for multi-D/multi-reduce convolution
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_convnd_fwd_multiple_d_multiple_reduce_impl.hpp # Implementation
+include/ck/tensor_operation/gpu/grid/
+    └── gridwise_convnd_fwd_multiple_d_multiple_reduce.hpp # Grid-level kernel
+```
+
+### Key Classes and Functions
+
+- **DeviceConvNdFwdMultipleDMultipleReduce** (in `device_convnd_fwd_multiple_d_multiple_reduce.hpp`):  
+  Device API for N-dimensional convolution with multiple outputs and reductions.
+- **gridwise_convnd_fwd_multiple_d_multiple_reduce** (in `gridwise_convnd_fwd_multiple_d_multiple_reduce.hpp`):  
+  Implements the tiled/blocking convolution kernel with multi-output/reduce epilogue.
+
+This example demonstrates how Composable Kernel supports advanced convolution patterns with multiple outputs and reductions in a single efficient kernel.
--- a/example/11_convnd_fwd_bias/README.md
+++ b/example/11_convnd_fwd_bias/README.md
@@ -0,0 +1,57 @@
+# N-Dimensional Convolution Forward with Bias
+
+## Theory
+
+This example demonstrates **N-dimensional convolution forward** with bias addition. This is a common pattern in convolutional neural networks (CNNs), where a bias term is added to each output channel after the convolution operation.
+
+**Mathematical Formulation:**
+$$
+Y[n, c_{out}, o_1, ..., o_n] = \sum_{c_{in}} \sum_{k_1} ... \sum_{k_n} X[n, c_{in}, o_1 + k_1, ..., o_n + k_n] \cdot W[c_{out}, c_{in}, k_1, ..., k_n] + B[c_{out}]
+$$
+- $X$: [N, C_in, D1, D2, ..., Dn] input tensor
+- $W$: [C_out, C_in, K1, K2, ..., Kn] weight tensor
+- $B$: [C_out] bias tensor
+- $Y$: [N, C_out, O1, O2, ..., On] output tensor
+
+**Algorithmic Background:**
+- Composable Kernel implements convolution as an implicit GEMM, with bias addition fused in the epilogue for efficiency.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/11_convnd_fwd_bias
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./convnd_fwd_bias_xdl --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/11_convnd_fwd_bias/
+├── convnd_fwd_bias_xdl.cpp         # Main example: sets up, runs, and verifies N-D convolution with bias
+include/ck/tensor_operation/gpu/device/
+│   └── device_convnd_fwd_bias.hpp       # Device-level convolution API with bias
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_convnd_fwd_bias_impl.hpp  # Implementation
+include/ck/tensor_operation/gpu/grid/
+    └── gridwise_convnd_fwd_bias.hpp     # Grid-level kernel
+```
+
+### Key Classes and Functions
+
+- **DeviceConvNdFwdBias** (in `device_convnd_fwd_bias.hpp`):  
+  Device API for N-dimensional convolution with bias.
+- **gridwise_convnd_fwd_bias** (in `gridwise_convnd_fwd_bias.hpp`):  
+  Implements the tiled/blocking convolution kernel with bias epilogue.
+
+This example demonstrates how Composable Kernel fuses bias addition into the convolution forward pass for efficient CNN layer implementation.
--- a/example/12_reduce/README.md
+++ b/example/12_reduce/README.md
@@ -1,6 +1,38 @@
-# Instructions for ```example_reduce_blockwise```
+# Parallel Reduction Operations
+
+## Theory
+
+This example demonstrates **parallel reduction operations** (e.g., sum, max, min, mean) over tensors. Reduction is a fundamental operation in deep learning for computing statistics (such as batch mean/variance), loss aggregation, and normalization.
+
+**Mathematical Formulation:**
+Given a tensor $X$ and a reduction axis $a$:
+$$
+Y = \text{reduce}_{a}(X)
+$$
+- For sum: $Y = \sum_{i \in a} X_i$
+- For max: $Y = \max_{i \in a} X_i$
+- For mean: $Y = \frac{1}{|a|} \sum_{i \in a} X_i$
+
+**Algorithmic Background:**
+- Reductions are implemented using parallel tree reduction or segmented reduction algorithms.
+- Efficient reductions require careful memory access, synchronization, and sometimes numerically stable algorithms (e.g., Welford's for variance).
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/12_reduce
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+```

 ## Run ```example_reduce_blockwise```
+
 ```bash
 # -D <xxx> : input 3D/4D/5D tensor lengths
 # -R <xxx> : reduce dimension ids
@@ -11,7 +43,8 @@
 ./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 0 2 1
 ```

-Result
+Expected Result:
+
 ```
 ./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 0 2 1
 launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} 
@@ -21,6 +54,7 @@ Perf: 0.238063 ms, 264.285 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSr
 ```

 ## Run ```example_reduce_multiblock_atomic_add```
+
 ```bash
 # -D <xxx> : input 3D/4D/5D tensor lengths
 # -R <xxx> : reduce dimension ids
@@ -31,7 +65,7 @@ Perf: 0.238063 ms, 264.285 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSr
 ./bin/example_reduce_multiblock_atomic_add -D 16,64,32,960 -v 1 0 2 0
 ```

-Result
+Expected Result:
 ```
 ./bin/example_reduce_multiblock_atomic_add -D 16,64,32,960 -v 1 0 2 0
 Perf: 0 ms, inf GB/s, DeviceReduceMultiBlock<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
@@ -42,6 +76,7 @@ echo $?
 # Instructions for ```example_reduce_blockwise_two_call```

 ## Run ```example_reduce_blockwise_two_call```
+
 ```bash
 #arg1:  verification (0=no, 1=yes(
 #arg2:  initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
@@ -49,7 +84,8 @@ echo $?
 ./bin/example_reduce_blockwise_two_call 1 2 1
 ```

-Result
+Expected Result:
+
 ```
 ./bin/example_reduce_blockwise_two_call 1 2 1
 launch_and_time_kernel: grid_dim {204800, 1, 1}, block_dim {256, 1, 1}
@@ -60,3 +96,30 @@ Warm up 1 time
 Start running 10 times...
 Perf: 2.1791 ms, 771.42 GB/s, DeviceReduceBlockWise<256,M_C32_S1,K_C8_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> => DeviceReduceBlockWise<256,M_C256_S1,K_C1_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1>
 ```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/12_reduce/
+├── reduce_xdl.cpp         # Main example: sets up, runs, and verifies reduction
+include/ck/tensor_operation/gpu/device/
+│   └── device_reduce.hpp       # Device-level reduction API
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_reduce_impl.hpp  # Implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_reduce.hpp     # Grid-level reduction kernel
+include/ck/tensor_operation/gpu/block/
+    └── blockwise_reduce.hpp    # Block-level reduction
+```
+
+### Key Classes and Functions
+
+- **DeviceReduce** (in `device_reduce.hpp`):  
+  Device API for reductions.
+- **gridwise_reduce** (in `gridwise_reduce.hpp`):  
+  Implements the tiled/blocking reduction kernel.
+- **blockwise_reduce** (in `blockwise_reduce.hpp`):  
+  Handles block-level reduction and shared memory.
+
+This example demonstrates how Composable Kernel implements efficient parallel reductions for deep learning and scientific computing.
--- a/example/13_pool2d_fwd/README.md
+++ b/example/13_pool2d_fwd/README.md
@@ -1,6 +1,41 @@
-# Instructions for ```example_pool2d_fwd``` Examples
+# 2D Pooling Forward
+
+## Theory
+
+This example demonstrates the **2D pooling forward pass**, a key operation in convolutional neural networks (CNNs) for spatial downsampling. Pooling reduces the spatial dimensions of feature maps, providing translation invariance and reducing computation.
+
+**Mathematical Formulation:**
+Given input $X[N, C, H_{in}, W_{in}]$, pooling window $(k_H, k_W)$, stride $(s_H, s_W)$, and padding $(p_H, p_W)$:
+- Output $Y[N, C, H_{out}, W_{out}]$
+- $H_{out} = \left\lfloor \frac{H_{in} + 2p_H - k_H}{s_H} \right\rfloor + 1$
+- $W_{out} = \left\lfloor \frac{W_{in} + 2p_W - k_W}{s_W} \right\rfloor + 1$
+
+For each output position:
+- **Max Pooling:** $Y_{n,c,h,w} = \max_{i,j} X_{n,c,h \cdot s_H + i, w \cdot s_W + j}$
+- **Average Pooling:** $Y_{n,c,h,w} = \frac{1}{k_H k_W} \sum_{i,j} X_{n,c,h \cdot s_H + i, w \cdot s_W + j}$
+
+**Algorithmic Background:**
+- Each thread computes one or more output elements.
+- Handles padding and boundary conditions.
+- Optimizes memory access for bandwidth.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/13_pool2d_fwd
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+```
+
+### Run ```example_pool2d_fwd_fp16```

-## Run ```example_pool2d_fwd_fp16```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
@@ -9,7 +44,7 @@
 ./bin/example_pool2d_fwd_fp16 1 1 1
 ```

-Result 
+Expected Result: 
 ```
 in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
 out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
@@ -19,7 +54,8 @@ Start running 10 times...
 Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s
 ```

-## Run ```example_pool2d_fwd_fp32```
+### Run ```example_pool2d_fwd_fp32```
+
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
@@ -29,8 +65,9 @@ Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s
 ```


-Result 
-```
+Expected Result: 
+
+```bash
 ./bin/example_pool2d_fwd_fp32 1 1 1
 in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
 out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
@@ -39,3 +76,31 @@ Warm up 1 time
 Start running 10 times...
 Perf: 1.01823 ms, 0.563045 TFlops, 611.8 GB/s
 ```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/13_pool2d_fwd/
+├── pool2d_fwd_xdl.cpp         # Main example: sets up, runs, and verifies 2D pooling
+include/ck/tensor_operation/gpu/device/
+│   └── device_pool_fwd.hpp       # Device-level pooling API
+include/ck/tensor_operation/gpu/device/impl/
+│   ├── device_pool2d_fwd_nhwc.hpp # NHWC layout optimization
+│   └── device_pool2d_fwd_nchw.hpp # NCHW layout optimization
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_pool_fwd.hpp     # Grid-level pooling kernel
+include/ck/tensor_operation/gpu/block/
+    └── blockwise_pool.hpp        # Block-level pooling
+```
+
+### Key Classes and Functions
+
+- **DevicePoolFwd** (in `device_pool_fwd.hpp`):  
+  Device API for pooling.
+- **gridwise_pool_fwd** (in `gridwise_pool_fwd.hpp`):  
+  Implements the tiled/blocking pooling kernel.
+- **blockwise_pool** (in `blockwise_pool.hpp`):  
+  Handles block-level pooling and shared memory.
+
+This example demonstrates how Composable Kernel implements efficient 2D pooling for CNNs and vision models.
--- a/example/14_gemm_quantization/README.md
+++ b/example/14_gemm_quantization/README.md
@@ -0,0 +1,60 @@
+# GEMM with Quantization
+
+## Theory
+
+This example demonstrates **GEMM (General Matrix Multiplication) with quantized inputs or weights**. Quantization is a technique to reduce memory and computation by representing values with lower-precision integer types (e.g., int8), commonly used for efficient inference in deep learning.
+
+**Mathematical Formulation:**
+- Quantized GEMM: $C = \text{dequant}(A_q) \times \text{dequant}(B_q)$
+- $A_q$, $B_q$: quantized matrices (e.g., int8)
+- $\text{dequant}(x_q) = (x_q - z) \cdot s$ (scale $s$, zero-point $z$)
+- $C$: output matrix (often in higher precision, e.g., float32 or float16)
+
+**Algorithmic Background:**
+- Quantized values are dequantized on-the-fly during GEMM computation.
+- Accumulation is performed in higher precision for accuracy.
+- Supports symmetric and asymmetric quantization.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/14_gemm_quantization
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./gemm_quantization_xdl --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/14_gemm_quantization/
+├── gemm_quantization_xdl.cpp         # Main example: sets up, runs, and verifies quantized GEMM
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm_quantized.hpp       # Device-level quantized GEMM API
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_gemm_quantized_impl.hpp  # Implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_gemm_quantized.hpp     # Grid-level quantized GEMM kernel
+include/ck/tensor_operation/gpu/element/
+    └── quantization_operations.hpp     # Quantization/dequantization utilities
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmQuantized** (in `device_gemm_quantized.hpp`):  
+  Device API for quantized GEMM.
+- **gridwise_gemm_quantized** (in `gridwise_gemm_quantized.hpp`):  
+  Implements the tiled/blocking quantized GEMM kernel.
+- **quantization_operations** (in `quantization_operations.hpp`):  
+  Defines quantization and dequantization functions.
+
+This example demonstrates how Composable Kernel supports efficient quantized matrix multiplication for deep learning inference.
--- a/example/15_grouped_gemm/README.md
+++ b/example/15_grouped_gemm/README.md
@@ -1,9 +1,64 @@
-# Instructions for ```example_grouped_gemm_xdl```
+# Grouped GEMM
+
+## Theory
+
+This example demonstrates **grouped GEMM**: performing multiple independent GEMM operations (with potentially different shapes) in a single kernel launch. Grouped GEMM is used in transformer models (e.g., multi-head attention), mixture-of-experts, and other architectures requiring heterogeneous batched matrix multiplications.
+
+**Mathematical Formulation:**
+For $G$ groups, each with its own $A_g$, $B_g$, $C_g$:
+$$
+C_g = A_g \times B_g \quad \text{for} \quad g = 1, 2, ..., G
+$$
+- $A_g$: [M_g, K_g] input matrix for group $g$
+- $B_g$: [K_g, N_g] weight matrix for group $g$
+- $C_g$: [M_g, N_g] output matrix for group $g$
+
+**Algorithmic Background:**
+- Each group can have different matrix sizes and strides.
+- The kernel launches a grid covering all groups, with each block assigned to a group.
+- Useful for variable-length sequences, multi-head attention, and expert routing.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/15_grouped_gemm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+```
+
+### Run ```example_grouped_gemm_xdl```

-## Run ```example_grouped_gemm_xdl```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 #arg3: run kernel # of times (>1)
 ./bin/example_grouped_gemm_xdl_fp16 0 1 5
 ```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/15_grouped_gemm/
+├── grouped_gemm_xdl.cpp         # Main example: sets up, runs, and verifies grouped GEMM
+include/ck/tensor_operation/gpu/device/
+│   └── device_grouped_gemm_xdl.hpp       # Device-level grouped GEMM API
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_grouped_gemm_xdl.hpp     # Grid-level grouped GEMM kernel
+```
+
+### Key Classes and Functions
+
+- **DeviceGroupedGemmXdl** (in `device_grouped_gemm_xdl.hpp`):  
+  Device API for grouped GEMM.
+- **gridwise_grouped_gemm_xdl** (in `gridwise_grouped_gemm_xdl.hpp`):  
+  Implements the tiled/blocking grouped GEMM kernel.
+
+This example demonstrates how Composable Kernel supports efficient heterogeneous batched matrix multiplication for advanced AI/ML workloads.
--- a/example/16_gemm_multi_d_multi_reduces/README.md
+++ b/example/16_gemm_multi_d_multi_reduces/README.md
@@ -0,0 +1,56 @@
+# GEMM with Multiple D and Multiple Reductions
+
+## Theory
+
+This example demonstrates **GEMM with multiple auxiliary tensors (D) and multiple reduction operations**. This pattern is used in advanced neural network layers that require additional outputs or statistics (such as sums, means, or other reductions) alongside the main GEMM result.
+
+**Mathematical Formulation:**
+- For each GEMM: $C = A \times B$
+- Auxiliary tensors: $D_0, D_1, ...$ (various shapes)
+- Reductions: e.g., sum, mean, max over specified axes or outputs
+
+The kernel computes the main GEMM output and additional reductions or statistics in a single pass.
+
+**Algorithmic Background:**
+- The GEMM result is kept in registers, auxiliary tensors are fused in the epilogue, and reductions are computed as part of the output.
+- Useful for multi-task learning, attention statistics, and custom neural network layers.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/16_gemm_multi_d_multi_reduces
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./gemm_multi_d_multi_reduces_xdl --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/16_gemm_multi_d_multi_reduces/
+├── gemm_multi_d_multi_reduces_xdl.cpp         # Main example: sets up, runs, and verifies GEMM with multi-D/multi-reduce
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm_multi_d_multi_reduces.hpp       # Device-level API for multi-D/multi-reduce GEMM
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_gemm_multi_d_multi_reduces_impl.hpp  # Implementation
+include/ck/tensor_operation/gpu/grid/
+    └── gridwise_gemm_multi_d_multi_reduces.hpp     # Grid-level kernel
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmMultiDMultiReduces** (in `device_gemm_multi_d_multi_reduces.hpp`):  
+  Device API for GEMM with multiple outputs and reductions.
+- **gridwise_gemm_multi_d_multi_reduces** (in `gridwise_gemm_multi_d_multi_reduces.hpp`):  
+  Implements the tiled/blocking GEMM kernel with multi-output/reduce epilogue.
+
+This example demonstrates how Composable Kernel supports advanced GEMM patterns with multiple outputs and reductions in a single efficient kernel.
--- a/example/17_convnd_bwd_data/README.md
+++ b/example/17_convnd_bwd_data/README.md
@@ -1,6 +1,62 @@
-# Instructions for ```example_convnd_bwd_data_xdl```
+# N-Dimensional Convolution Backward Pass for Data

-## Run ```example_example_convnd_bwd_data_xdl```
+This example demonstrates the backward data pass of an N-dimensional convolution, often denoted as `conv_bwd_data`. This operation is a crucial part of the backpropagation algorithm for training Convolutional Neural Networks (CNNs). Its purpose is to compute the gradient of the loss function with respect to the convolution's *input data*, which is then passed back to the preceding layer in the network.
+
+## Mathematical Formulation
+
+The backward data pass computes the gradient $\frac{\partial L}{\partial \text{In}}$, given the gradient from the subsequent layer, $\frac{\partial L}{\partial \text{Out}}$, and the filter weights `W` used in the forward pass.
+
+Let the forward convolution be defined as:
+$\text{Out} = \text{In} \star W$
+
+The backward data pass is mathematically equivalent to a "full" convolution between the output gradient tensor `dL/dOut` and the 180-degree rotated (or transposed and flipped) weight tensor `W`.
+
+$\frac{\partial L}{\partial \text{In}} = \frac{\partial L}{\partial \text{Out}} \star \text{rot180}(W)$
+
+This operation propagates the error signal from the output back to the input, weighted by the same filters that were used in the forward pass.
+
+## Algorithmic Strategy: Implicit GEMM
+
+As with the forward pass, the most efficient way to implement the backward data pass on a GPU is to transform the convolution into a General Matrix-Matrix Multiplication (GEMM) problem.
+
+1.  **Output Gradient Reshaping**: The output gradient tensor `dL/dOut` is logically reshaped into a matrix `dL/dOut'` of shape `[K, (N*Ho*Wo)]`. This becomes the "A" matrix in the GEMM.
+
+2.  **Weight Reshaping**: The weight tensor `W` is logically reshaped into a matrix `W'` of shape `[K, (C*Y*X)]`. This becomes the "B" matrix in the GEMM.
+
+3.  **Implicit GEMM**: The core computation is then formulated as a GEMM operation. However, the output of this GEMM is not a simple matrix; it's the `dL/dIn` tensor.
+    $(\text{dL/dIn})' = (W')^T \times (\text{dL/dOut})'$
+
+    The key insight is that this operation can be performed without explicitly forming the matrices. The GEMM kernel is designed to read from `dL/dOut` and `W` and write its results directly to the appropriate locations in the `dL/dIn` tensor. This process is sometimes referred to as an "implicit `col2im`" (column-to-image), as it is the inverse of the `im2col` transformation used in the forward pass.
+
+This "implicit GEMM" approach is highly efficient. It avoids the massive memory and bandwidth overhead of materializing intermediate matrices, which is critical for performance.
+
+## Source Code Organization
+
+-   [`conv_bwd_data_xdl.cpp`](./conv_bwd_data_xdl.cpp): The main example file that defines the parameters for a 2D convolution and instantiates the generic `DeviceConvNdBwdData` kernel to compute the input gradients.
+-   [`../../include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp`](../../include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp): The high-level device interface for the backward data convolution. It is templated on the dimensionality, layouts, and data types of the problem.
+-   [`../../include/ck/tensor_operation/gpu/grid/gridwise_gemm_implicit_gemm_v1r2_xdlops_nchw_kcyx_nkhw.hpp`](../../include/ck/tensor_operation/gpu/grid/gridwise_gemm_implicit_gemm_v1r2_xdlops_nchw_kcyx_nkhw.hpp): An example of a specific grid-wise kernel that implements the implicit GEMM algorithm for the backward data pass. The library contains multiple such kernels optimized for different layouts and problem types, and the `DeviceConvNdBwdData` interface selects the most appropriate one.
+-   [`../../library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp`](../../library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp): A CPU reference implementation used to verify the correctness of the GPU kernel's output.
+
+## Build and Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/17_convnd_bwd_data
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
@@ -45,3 +101,16 @@ Warm up
 Start running 1 times...
 Perf: 1.40031 ms, 69.8734 TFlops, 179.037 GB/s
 ```
+
+## Relationship to Other Passes
+
+The training of a single convolutional layer requires three distinct steps:
+
+1.  **Forward Pass (`conv_fwd`)**: Computes the output feature maps.
+    -   `Out = In * W`
+2.  **Backward Data Pass (`conv_bwd_data`)**: Computes the gradient with respect to the input, propagating the error to the previous layer. This is the focus of the current example.
+    -   `dL/dIn = dL/dOut * rot180(W)`
+3.  **Backward Weight Pass (`conv_bwd_weight`)**: Computes the gradient with respect to the weights, which is needed for the weight update.
+    -   `dL/dW = In * dL/dOut`
+
+All three passes are critical for training a CNN, and all are typically implemented as high-performance implicit GEMM operations.
--- a/example/18_batched_gemm_reduce/README.md
+++ b/example/18_batched_gemm_reduce/README.md
@@ -0,0 +1,78 @@
+# Batched GEMM with Reduction
+
+This example demonstrates a Batched General Matrix-Matrix Multiplication (Batched GEMM) where the result of each individual GEMM in the batch is then reduced along one of its dimensions. This is a specialized fusion pattern that combines a compute-intensive operation (GEMM) with a memory-intensive one (reduction), offering significant performance benefits for specific workloads.
+
+## Mathematical Formulation
+
+The operation performs a standard GEMM for each item in a batch, and then reduces the resulting matrix to a vector. For each batch item `b` from `0` to `BatchCount-1`:
+
+1.  **GEMM Stage**: A standard matrix multiplication is performed.
+    $C_{[b]} = A_{[b]} \times B_{[b]}$
+
+2.  **Reduction Stage**: The resulting matrix $C_{[b]}$ is reduced along one of its dimensions (e.g., the M dimension) to produce an output vector $D_{[b]}$.
+    $D_{[b], j} = \bigoplus_{i=0}^{M-1} C_{[b], i, j}$
+
+Where:
+-   $A_{[b]}$ is an $M \times K$ matrix.
+-   $B_{[b]}$ is a $K \times N$ matrix.
+-   $C_{[b]}$ is the intermediate $M \times N$ result matrix for batch `b`.
+-   $D_{[b]}$ is the final $1 \times N$ output vector for batch `b`.
+-   $\bigoplus$ is a binary, associative reduction operator like sum, max, or min.
+
+The key optimization is that the intermediate matrix $C_{[b]}$ is never written to global memory. The reduction is fused directly into the GEMM kernel.
+
+## Algorithmic Strategy: Fused GEMM and Reduction
+
+The implementation fuses the reduction into the epilogue of a batched GEMM kernel. The batch dimension provides a natural axis for parallelism.
+
+1.  **Batch Scheduling**: The `BatchCount` GEMM problems are distributed across the GPU's thread blocks. Each block is assigned one or more GEMMs from the batch to compute.
+
+2.  **Tiled GEMM Core**: For each assigned GEMM, the thread block runs a standard tiled GEMM algorithm to compute the product $A_{[b]} \times B_{[b]}$. The result for each tile of $C_{[b]}$ is accumulated in the private registers of the threads.
+
+3.  **Fused Reduction Epilogue**: This is where the fusion occurs. Instead of writing the computed tile of $C_{[b]}$ to global memory, the threads use it as input for a parallel reduction.
+    -   **Intra-Block Reduction**: The threads within a block, which collectively hold the values for a tile of $C_{[b]}$, perform a local reduction. For example, to reduce along the M dimension, threads responsible for different M-rows but the same N-column will cooperate, using fast shared memory to sum their partial results.
+    -   **Inter-Block Reduction**: Since multiple thread blocks may be working on different M-tiles for the same batch item, their partial reduction results must be combined. Each block writes its partial sum to a designated location in the output vector `D`, using atomic operations (like `atomicAdd`) to safely accumulate the final result.
+
+This strategy completely eliminates the global memory traffic associated with the intermediate matrix `C`, which is often the largest tensor in the operation. This leads to substantial savings in memory bandwidth and improved performance.
+
+## Source Code Organization
+
+-   [`batched_gemm_reduce_xdl.cpp`](./batched_gemm_reduce_xdl.cpp): The main example file. It sets up the batched GEMM problem and instantiates the `DeviceBatchedGemmReduce` operation, specifying the reduction dimension and operator.
+-   [`../../include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce.hpp`](../../include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce.hpp): The high-level device interface for this fused operation.
+-   [`../../include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_reduce_xdl_cshuffle.hpp`](../../include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_reduce_xdl_cshuffle.hpp): The grid-wise kernel that implements the fused logic. It handles the batch scheduling, the tiled GEMM, and the fused reduction epilogue with atomic operations for inter-block communication.
+
+## Build and Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/18_batched_gemm_reduce
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+```bash
+# Run the example with default settings
+./batched_gemm_reduce_xdl
+
+# Run with verification, data initialization, and timing
+./batched_gemm_reduce_xdl 1 2 1
+```
+
+## Applications
+
+This fused pattern is less common than simple GEMM+Bias but is highly effective for specific algorithms.
+
+-   **Gradient Computations**: In some complex neural network layers, the gradient calculation might involve a matrix product followed by a summation. For example, computing the gradient with respect to a bias term often involves summing the output gradients over the batch and spatial dimensions. If the output gradient itself is the result of a GEMM, this fused kernel could be applicable.
+-   **Custom Attention Mechanisms**: While standard attention involves a `softmax`, some research explores attention-like mechanisms that might use a simple sum or max reduction instead. If the query-key interaction is formulated as a batched GEMM, this kernel could compute the attention weights in a single, fused step.
+-   **Scientific Computing**: Certain numerical methods, particularly in physics or signal processing, may involve performing a linear transform (GEMM) on a set of signals (a batch) and then integrating the result (a reduction).
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -25,8 +25,9 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;

-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;

 using ADataType         = F16;
 using BDataType         = F16;
@@ -138,12 +139,12 @@ int main(int argc, char* argv[])
        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
            return HostTensorDescriptor(
-                {batch_count, row, col}, {row * stride, stride, 1_uz}, layout);
+                {batch_count, row, col}, {row * stride, stride, 1_uz}, Bypass{});
        }
        else
        {
            return HostTensorDescriptor(
-                {batch_count, row, col}, {col * stride, 1_uz, stride}, layout);
+                {batch_count, row, col}, {col * stride, 1_uz, stride}, Bypass{});
        }
    };

--- a/example/19_binary_elementwise/README.md
+++ b/example/19_binary_elementwise/README.md
@@ -0,0 +1,84 @@
+# Binary Elementwise Operations with Broadcasting
+
+This example demonstrates a generic binary elementwise operation, a fundamental building block in numerical computing. It covers two important cases:
+1.  **Simple Elementwise**: Applying a binary function to two input tensors of the *same* shape.
+2.  **Elementwise with Broadcasting**: Applying a binary function to two input tensors of *different but compatible* shapes.
+
+Broadcasting defines a set of rules for applying elementwise operations on tensors of different sizes, and it is a cornerstone of libraries like NumPy and TensorFlow.
+
+## Mathematical Formulation
+
+### Simple Elementwise
+Given two input tensors, A and B, of the same rank and dimensions, and a binary operator $\odot$, the operation computes an output tensor C where each element is:
+
+$C_{i,j,k,\dots} = A_{i,j,k,\dots} \odot B_{i,j,k,\dots}$
+
+### Elementwise with Broadcasting
+Broadcasting allows elementwise operations on tensors with different shapes, provided they are compatible. Two dimensions are compatible if they are equal, or if one of them is 1. The operation implicitly "stretches" or "duplicates" the tensor with the dimension of size 1 to match the other tensor's shape.
+
+For example, adding a bias vector `B` of shape `(1, N)` to a matrix `A` of shape `(M, N)`:
+$C_{i,j} = A_{i,j} + B_{0,j}$
+
+Here, the single row of `B` is broadcast across all `M` rows of `A`. The output tensor `C` has the shape `(M, N)`.
+
+Common binary elementwise operations include addition, subtraction, multiplication (Hadamard product), division, max, and min.
+
+## Algorithmic Strategy: Grid-Stride Loop with Broadcasting
+
+The implementation for both cases relies on the efficient **grid-stride loop**, which is adapted to handle broadcasting.
+
+1.  **Grid Partitioning**: The problem is mapped to a 1D grid of threads based on the number of elements in the **output** tensor.
+
+2.  **Grid-Stride Loop**: Each thread iterates through a subset of the output elements. For each output index, it must calculate the corresponding indices into the input tensors A and B.
+
+3.  **Broadcasting Logic**:
+    -   The core of the broadcasting logic lies in the `get_broadcast_coord` function. If an input tensor's dimension is 1, the coordinate for that dimension is always set to 0, effectively reusing the same element across the broadcast dimension. If the dimension matches the output, the coordinate is passed through.
+    -   This strategy ensures that memory accesses to the larger tensor remain coalesced, while accesses to the smaller, broadcasted tensor will naturally involve re-reading the same values, which is efficiently handled by the GPU's cache hierarchy.
+
+Like the simple case, broadcasted elementwise operations are almost always memory-bandwidth-bound.
+
+## Source Code Organization
+
+This example contains multiple files to demonstrate different scenarios:
+
+-   [`binary_elementwise_xdl.cpp`](./binary_elementwise_xdl.cpp): Demonstrates the simple case where both input tensors have the same shape.
+-   [`broadcast_add_2d_amn_bn.cpp`](./broadcast_add_2d_amn_bn.cpp): A specific example of broadcasting, adding a vector `B` of shape `(N)` to a 2D tensor `A` of shape `(M, N)`, i.e. `C[m, n] = A[m, n] + B[n]`.
+-   [`../../include/ck/tensor_operation/gpu/device/device_elementwise.hpp`](../../include/ck/tensor_operation/gpu/device/device_elementwise.hpp): The high-level device interface. It is generic enough to handle both simple and broadcasted operations by correctly interpreting the tensor descriptors, which contain shape and stride information.
+-   [`../../include/ck/tensor_operation/gpu/grid/gridwise_elementwise.hpp`](../../include/ck/tensor_operation/gpu/grid/gridwise_elementwise.hpp): The grid-wise kernel that implements the grid-stride loop. The tensor coordinate logic within this kernel correctly handles broadcasting based on the provided tensor descriptors.
+-   [`../../include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp`](../../include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp): Defines the various binary operator functors (like `Add`, `Multiply`, etc.).
+
+## Build and Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/19_binary_elementwise
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+```bash
+# Run the simple elementwise example
+./binary_elementwise_xdl 1 2 1
+
+# Run the broadcasting example
+./broadcast_add_2d_amn_bn 1 2 1
+```
+
+## Applications
+
+Broadcasting is a powerful feature that makes code more concise and memory-efficient.
+-   **Adding Bias**: The most common use case in deep learning is adding a bias vector (shape `[N]`) to a matrix of activations (shape `[Batch, N]`).
+-   **Feature Scaling**: Multiplying a feature map (shape `[N, C, H, W]`) by a per-channel scaling factor (shape `[1, C, 1, 1]`).
+-   **Standardization**: In data preprocessing, subtracting the mean (a vector) and dividing by the standard deviation (another vector) from a data matrix.
+-   **Coordinate Grids**: Creating coordinate grids by adding a row vector `[0, 1, 2...]` to a column vector `[0, 1, 2...]^T`.
--- a/example/20_grouped_conv_bwd_weight/README.md
+++ b/example/20_grouped_conv_bwd_weight/README.md
@@ -0,0 +1,77 @@
+# Grouped Convolution Backward Pass for Weights
+
+This example demonstrates the backward weight pass for a **grouped convolution**, often denoted as `grouped_conv_bwd_weight`. This operation is essential for training neural networks that use grouped or depthwise convolutions, such as ResNeXt, MobileNets, and EfficientNets. Its purpose is to compute the gradient of the loss function with respect to the convolution's *filter weights*, which is then used by an optimizer (like SGD or Adam) to update the model's parameters.
+
+## Mathematical Formulation
+
+The backward weight pass computes the gradient $\frac{\partial L}{\partial W}$, given the input tensor from the forward pass, `In`, and the gradient from the subsequent layer, `dL/dOut`.
+
+For a single group `g`, the operation is mathematically equivalent to a convolution between the input tensor for that group, `In_[g]`, and the output gradient tensor for that group, `dL/dOut_[g]`.
+
+$\frac{\partial L}{\partial W_{[g]}} = \text{In}_{[g]} \star \frac{\partial L}{\partial \text{Out}_{[g]}}$
+
+This operation correlates the input activations with the output error signals to determine how each weight should be adjusted to reduce the overall loss. The total gradient `dL/dW` is the collection of gradients for all `G` groups.
+
+## Algorithmic Strategy: Implicit Grouped GEMM
+
+This operation is a perfect candidate for the **Grouped GEMM** primitive. The convolution for each of the `G` groups is independently transformed into a GEMM problem, and all `G` GEMMs are executed in a single kernel launch.
+
+For each group `g`:
+
+1.  **Input to Columns (`im2col`)**: The input tensor `In_[g]` is logically unrolled into a matrix `In'_[g]`. This is the same `im2col` transformation used in the forward pass. This matrix becomes the "A" matrix in the GEMM.
+
+2.  **Output Gradient Reshaping**: The output gradient tensor `dL/dOut_[g]` is logically reshaped into a matrix `(dL/dOut)'_[g]`. This matrix becomes the "B" matrix in the GEMM.
+
+3.  **Implicit Grouped GEMM**: The weight gradient `dL/dW_[g]` is computed by a single GEMM:
+    $(\text{dL/dW})'_{[g]} = (\text{dL/dOut})'_{[g]} \times (\text{In}'_{[g]})^T$
+
+The key to performance is that this is executed as a **Grouped GEMM**. The `DeviceGroupedConvBwdWeight` interface takes the `G` independent problems and maps them to a `DeviceGroupedGemm` kernel. This kernel schedules the `G` independent GEMMs across the GPU's compute units. The `im2col` transformation is performed implicitly; the GEMM kernel reads data directly from the original `In` and `dL/dOut` tensors in the correct pattern, avoiding the materialization of large intermediate matrices.
+
+This approach is highly efficient as it leverages the task-parallel nature of the grouped convolution and the computational efficiency of highly optimized GEMM kernels.
+
+## Source Code Organization
+
+-   [`grouped_conv_bwd_weight_xdl.cpp`](./grouped_conv_bwd_weight_xdl.cpp): The main example file. It sets up a grouped convolution problem and instantiates the `DeviceGroupedConvBwdWeight` operation.
+-   [`../../include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp`](../../include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp): The high-level device interface. It internally translates the grouped convolution problem into a set of arguments for the `DeviceGroupedGemm` interface.
+-   [`../../include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp`](../../include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp): The underlying Grouped GEMM device interface that is called by the grouped convolution operator.
+-   [`../../library/include/ck/library/reference_tensor_operation/cpu/reference_grouped_conv_bwd_weight.hpp`](../../library/include/ck/library/reference_tensor_operation/cpu/reference_grouped_conv_bwd_weight.hpp): A CPU reference implementation for verifying the correctness of the GPU kernel.
+
+## Build and Run
+
+### Prerequisites
+Ensure the Composable Kernel library is built and installed.
+```bash
+cd /path/to/composable_kernel/build
+make -j install
+```
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/20_grouped_conv_bwd_weight
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+```bash
+# Run the example with default settings
+./grouped_conv_bwd_weight_xdl
+
+# Run with verification, data initialization, and timing
+./grouped_conv_bwd_weight_xdl 1 2 1
+```
+
+## Importance in Modern CNNs
+
+Grouped and depthwise convolutions are the cornerstone of many efficient, state-of-the-art CNN architectures.
+-   **Parameter Efficiency**: By not connecting every input channel to every output channel, grouped convolutions significantly reduce the number of weights in a layer, leading to smaller and faster models.
+-   **Depthwise Separable Convolutions**: Used in MobileNets, EfficientNets, and Xception, these layers factorize a standard convolution into a depthwise convolution (a grouped convolution with `G = C`) and a pointwise convolution (`1x1` conv). The backward pass for the depthwise part requires an efficient `grouped_conv_bwd_weight` implementation.
+-   **ResNeXt**: This architecture introduced the "cardinality" dimension, which is simply the number of groups in a grouped convolution, demonstrating that increasing the number of groups can be more effective than increasing layer depth or width.
+
+An optimized `grouped_conv_bwd_weight` kernel is therefore not an exotic feature but a critical requirement for training a wide range of modern and efficient deep learning models.
--- a/example/21_gemm_layernorm/README.md
+++ b/example/21_gemm_layernorm/README.md
@@ -0,0 +1,57 @@
+# GEMM with LayerNorm Fusion
+
+## Theory
+
+This example demonstrates **GEMM fused with layer normalization**. This pattern is used in transformer feed-forward networks and other architectures where a linear transformation is followed by normalization for improved training stability.
+
+**Mathematical Formulation:**
+- GEMM: $Y = A \times B$
+- LayerNorm: $\text{LayerNorm}(Y) = \gamma \cdot \frac{Y - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta$
+  - $\mu$: mean of $Y$ over the normalization axis
+  - $\sigma^2$: variance of $Y$ over the normalization axis
+  - $\gamma$, $\beta$: learnable scale and shift parameters
+
+**Algorithmic Background:**
+- The GEMM result is kept in registers, and layer normalization is applied before writing to global memory.
+- LayerNorm is typically applied over the last dimension (features).
+- This fusion reduces memory traffic and is common in transformer MLP blocks.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/21_gemm_layernorm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./gemm_layernorm_xdl --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/21_gemm_layernorm/
+├── gemm_layernorm_xdl.cpp         # Main example: sets up, runs, and verifies GEMM+LayerNorm
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm_layernorm.hpp       # Device-level GEMM+LayerNorm API
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_gemm_layernorm_impl.hpp  # Implementation
+include/ck/tensor_operation/gpu/grid/
+    └── gridwise_gemm_layernorm.hpp     # Grid-level kernel
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmLayerNorm** (in `device_gemm_layernorm.hpp`):  
+  Device API for GEMM fused with layer normalization.
+- **gridwise_gemm_layernorm** (in `gridwise_gemm_layernorm.hpp`):  
+  Implements the tiled/blocking GEMM kernel with layer normalization epilogue.
+
+This example demonstrates how Composable Kernel supports efficient fusion of linear and normalization layers for transformer and deep learning models.
--- a/example/22_cgemm/README.md
+++ b/example/22_cgemm/README.md
@@ -0,0 +1,83 @@
+# Complex General Matrix-Matrix Multiplication (CGEMM)
+
+This example demonstrates a General Matrix-Matrix Multiplication for complex-valued tensors (CGEMM). This operation is a fundamental building block in many scientific and engineering domains, including signal processing, quantum computing, and electromagnetics, where computations are naturally expressed using complex numbers.
+
+## Mathematical Formulation
+
+A complex number `z` can be represented as `z = a + bi`, where `a` is the real part and `b` is the imaginary part. The multiplication of two complex numbers `z1 = a + bi` and `z2 = c + di` is:
+
+$z_1 \cdot z_2 = (a+bi)(c+di) = (ac - bd) + (ad + bc)i$
+
+A CGEMM operation, $D = \alpha \cdot (A \times B) + \beta \cdot C$, involves matrices where each element is a complex number. The core matrix product $A \times B$ (written $C$ in the rest of this section, distinct from the addend $C$ in the formula above) is defined as:
+
+$C_{ik} = \sum_j A_{ij} \cdot B_{jk}$
+
+Here, each multiplication and addition is a complex operation. The complex product can be broken down into four real-valued GEMM operations:
+
+Let $A = A_r + iA_i$ and $B = B_r + iB_i$. Then the product $C = A \times B$ is:
+$C = (A_r + iA_i) \times (B_r + iB_i) = (A_r B_r - A_i B_i) + i(A_r B_i + A_i B_r)$
+
+This shows that one CGEMM can be decomposed into four real GEMMs and two real matrix additions/subtractions.
+
+## Algorithmic Strategy: Fused Complex Arithmetic
+
+A naive implementation would launch six separate real-valued kernels (4 GEMMs, 2 additions). A much more efficient approach, and the one used by Composable Kernel, is to implement CGEMM in a single, fused kernel.
+
+1.  **Data Layout**: Complex numbers are typically stored in an interleaved format, where the real and imaginary parts of an element are adjacent in memory (e.g., `[r1, i1, r2, i2, ...]`). The kernel is designed to work efficiently with this layout.
+
+2.  **Tiled CGEMM**: The kernel uses a standard tiled GEMM algorithm, but the fundamental operations are adapted for complex numbers.
+    -   **Loading**: A thread block loads tiles of the complex-valued matrices A and B from global memory into shared memory.
+    -   **Complex Multiply-Accumulate**: The core of the algorithm is the multiply-accumulate (MAC) operation. Instead of a single `fma` instruction, each complex MAC involves multiple real-valued `fma` instructions to compute the real and imaginary parts of the product, as shown in the mathematical formulation.
+        -   `real_part = (a_r * b_r) - (a_i * b_i)`
+        -   `imag_part = (a_r * b_i) + (a_i * b_r)`
+    -   These operations are carefully scheduled to maximize instruction-level parallelism and hide latency. The accumulators for both the real and imaginary parts are held in private registers.
+
+3.  **Storing**: After the tile is fully computed, the complex-valued result is written from registers back to the output matrix D in global memory.
+
+By fusing the complex arithmetic directly into the GEMM kernel, we avoid launching multiple kernels and storing large intermediate real-valued matrices, which dramatically reduces kernel launch overhead and memory bandwidth requirements.
+
+## Source Code Organization
+
+-   [`cgemm_xdl.cpp`](./cgemm_xdl.cpp): The main example file. It defines complex-valued input matrices and instantiates the `DeviceGemm` operation, specialized for complex data types.
+-   The standard `DeviceGemm` interface from [`../../include/ck/tensor_operation/gpu/device/device_gemm.hpp`](../../include/ck/tensor_operation/gpu/device/device_gemm.hpp) is used. Composable Kernel overloads this interface for complex types (`ck::complex<T>`).
+-   The grid-wise GEMM kernel is specialized to handle complex types. When the template arguments for data types are `ck::complex`, the compiler instantiates a version of the kernel where the MAC operations are replaced with the sequence of real-valued operations required for complex multiplication.
+
+## Build and Run
+
+### Prerequisites
+Ensure the Composable Kernel library is built and installed.
+```bash
+cd /path/to/composable_kernel/build
+make -j install
+```
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/22_cgemm
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+```bash
+# Run the example with default settings
+./cgemm_xdl
+
+# Run with verification, data initialization, and timing
+./cgemm_xdl 1 2 1
+```
+
+## Applications
+
+CGEMM is a critical kernel in many high-performance computing applications:
+
+-   **Digital Signal Processing (DSP)**: The Discrete Fourier Transform (DFT), the operation computed by the Fast Fourier Transform (FFT) and a cornerstone of DSP, can be expressed as a complex matrix multiplication. Filtering and convolution in the frequency domain also rely on complex arithmetic.
+-   **Quantum Computing Simulation**: The state of a quantum system is described by a vector of complex numbers, and quantum gates are represented by unitary matrices (a special type of complex matrix). Simulating a quantum circuit involves a sequence of CGEMM operations.
+-   **Electromagnetics and Wave Physics**: Simulating the propagation of electromagnetic or acoustic waves often involves solving systems of equations with complex numbers to represent the phase and amplitude of the waves.
+-   **Communications**: Modern communication systems (like 5G and Wi-Fi) use complex modulation schemes (like QAM) where signals are represented by complex numbers.
--- a/example/23_softmax/README.md
+++ b/example/23_softmax/README.md
@@ -1,6 +1,36 @@
-# Instructions for ```example_softmax_blockwise```
+# Parallel Softmax
+
+## Theory
+
+This example demonstrates **parallel softmax computation** over tensors. Softmax is a key operation in deep learning, especially in attention mechanisms and classification, converting logits into normalized probabilities.
+
+**Mathematical Formulation:**
+Given input $X$ and a reduction axis $a$, with the indices $i$ and $j$ running along axis $a$:
+$$
+\text{softmax}(X)_i = \frac{\exp(X_i)}{\sum_j \exp(X_j)}
+$$
+
+**Algorithmic Background:**
+- Softmax is implemented using a numerically stable algorithm:
+  1. Subtract the maximum value along the reduction axis from every element; this leaves the result unchanged but prevents overflow in the exponential.
+  2. Exponentiate and sum.
+  3. Normalize by the sum.
+- Efficient parallel softmax requires careful reduction and memory access patterns.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/23_softmax
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+```

-## Run ```example_softmax_blockwise```
 ```bash
 # -D <xxx> : input 3-d tensor lengths
 # -v <x> :   verification (0=no, 1=yes)
@@ -16,3 +46,30 @@ Warm up 1 time
 Start running 10 times...
 Perf: 0.0242877 ms, 259.039 GB/s, DeviceReduceSoftmax<256,M_C8_S1,K_C32_S8,InSrcVectorDim_1_InSrcVectorSize_8_OutDstVectorSize_8>
 ```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/23_softmax/
+├── softmax_xdl.cpp         # Main example: sets up, runs, and verifies softmax
+include/ck/tensor_operation/gpu/device/
+│   └── device_softmax.hpp       # Device-level softmax API
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_softmax_impl.hpp  # Implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_softmax.hpp     # Grid-level softmax kernel
+include/ck/tensor_operation/gpu/block/
+    └── blockwise_softmax.hpp    # Block-level softmax
+```
+
+### Key Classes and Functions
+
+- **DeviceSoftmax** (in `device_softmax.hpp`):  
+  Device API for softmax.
+- **gridwise_softmax** (in `gridwise_softmax.hpp`):  
+  Implements the tiled/blocking softmax kernel.
+- **blockwise_softmax** (in `blockwise_softmax.hpp`):  
+  Handles block-level softmax and shared memory.
+
+This example demonstrates how Composable Kernel implements efficient, numerically stable softmax for deep learning models.
--- a/example/24_batched_gemm/README.md
+++ b/example/24_batched_gemm/README.md
@@ -0,0 +1,57 @@
+# Batched GEMM
+
+## Theory
+
+This example demonstrates **batched GEMM**: performing multiple independent matrix multiplications (all with the same shape) in a single kernel launch. Batched GEMM is used in multi-head attention, RNNs, and other models requiring parallel matrix multiplications.
+
+**Mathematical Formulation:**
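+For $B$ batches (the unsubscripted $B$ is the batch count; the subscripted $B_b$ is the second input matrix of batch $b$):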
+$$
+C_b = A_b \times B_b \quad \text{for} \quad b = 1, 2, ..., B
+$$
+- $A_b$: [M, K] input matrix for batch $b$
+- $B_b$: [K, N] weight matrix for batch $b$
+- $C_b$: [M, N] output matrix for batch $b$
+
+**Algorithmic Background:**
+- All matrices in the batch have the same shape and strides.
+- The kernel launches a grid covering all batches, with each block assigned to a batch.
+- Used for multi-head attention, parallel MLPs, and more.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/24_batched_gemm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./batched_gemm_xdl --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/24_batched_gemm/
+├── batched_gemm_xdl.cpp         # Main example: sets up, runs, and verifies batched GEMM
+include/ck/tensor_operation/gpu/device/
+│   └── device_batched_gemm_xdl.hpp       # Device-level batched GEMM API
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_batched_gemm_xdl.hpp     # Grid-level batched GEMM kernel
+```
+
+### Key Classes and Functions
+
+- **DeviceBatchedGemmXdl** (in `device_batched_gemm_xdl.hpp`):  
+  Device API for batched GEMM.
+- **gridwise_batched_gemm_xdl** (in `gridwise_batched_gemm_xdl.hpp`):  
+  Implements the tiled/blocking batched GEMM kernel.
+
+This example demonstrates how Composable Kernel supports efficient parallel matrix multiplication for batched and multi-head workloads.
--- a/example/24_batched_gemm/run_batched_gemm_example.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example.inc
@@ -31,6 +31,7 @@ struct ExecutionConfig final
 bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
 {
    using namespace ck::literals;
+    using Bypass = ck::tensor_layout::BypassLayoutVerification;

 #if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
    static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
@@ -62,12 +63,12 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
            return HostTensorDescriptor(
-                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
+                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
        }
        else
        {
            return HostTensorDescriptor(
-                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
+                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
        }
    };

--- a/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc
@@ -116,6 +116,7 @@ inline __host__ __device__ constexpr double get_atol()
 bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
 {
    using namespace ck::literals;
+    using Bypass = ck::tensor_layout::BypassLayoutVerification;

    auto& [M,
           N,
@@ -138,12 +139,12 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
        if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
        {
            return HostTensorDescriptor(
-                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
+                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
        }
        else
        {
            return HostTensorDescriptor(
-                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
+                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
        }
    };

--- a/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc
@@ -37,6 +37,7 @@ struct ExecutionConfig final
 bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionConfig& config)
 {
    using namespace ck::literals;
+    using Bypass = ck::tensor_layout::BypassLayoutVerification;

    auto& [M,
           N,
@@ -65,12 +66,12 @@ bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionCo
        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
            return HostTensorDescriptor(
-                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
+                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
        }
        else
        {
            return HostTensorDescriptor(
-                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
+                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
        }
    };

--- a/example/25_gemm_bias_e_permute/README.md
+++ b/example/25_gemm_bias_e_permute/README.md
@@ -0,0 +1,56 @@
+# GEMM with Bias, Elementwise, and Permute Fusion
+
+## Theory
+
+This example demonstrates **GEMM fused with bias addition, elementwise operation, and permutation**. This pattern is used in transformer models and other neural architectures where a linear transformation is followed by bias, activation, and layout transformation.
+
+**Mathematical Formulation:**
+- GEMM: $Y = A \times B$
+- Bias: $Z = Y + \text{bias}$
+- Elementwise: $E = f(Z)$ (e.g., activation)
+- Permute: $O = \text{permute}(E, \text{axes})$
+
+**Algorithmic Background:**
+- The GEMM result is kept in registers, bias and elementwise ops are fused in the epilogue, and permutation is applied before writing to global memory.
+- Permutation changes the layout/order of tensor axes (e.g., NCHW to NHWC).
+- This fusion reduces memory traffic and is common in transformer and CNN pipelines.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/25_gemm_bias_e_permute
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./gemm_bias_e_permute_xdl --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/25_gemm_bias_e_permute/
+├── gemm_bias_e_permute_xdl.cpp         # Main example: sets up, runs, and verifies GEMM+Bias+Elementwise+Permute
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm_bias_e_permute.hpp       # Device-level API for fused GEMM
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_gemm_bias_e_permute_impl.hpp  # Implementation
+include/ck/tensor_operation/gpu/grid/
+    └── gridwise_gemm_bias_e_permute.hpp     # Grid-level kernel
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmBiasEPermute** (in `device_gemm_bias_e_permute.hpp`):  
+  Device API for GEMM fused with bias, elementwise, and permutation.
+- **gridwise_gemm_bias_e_permute** (in `gridwise_gemm_bias_e_permute.hpp`):  
+  Implements the tiled/blocking GEMM kernel with fused epilogue and permutation.
+
+This example demonstrates how Composable Kernel supports efficient fusion of linear, bias, activation, and layout operations for deep learning models.
--- a/example/26_contraction/README.md
+++ b/example/26_contraction/README.md
@@ -1,9 +1,80 @@
-# Instructions for ```example_contraction_bilinear_xdl_fp32```
+# Tensor Contraction

-## Run
+This example demonstrates a tensor contraction operation, which is a generalization of matrix multiplication to tensors of arbitrary rank (or number of dimensions). Tensor contractions are fundamental to many algorithms in physics, chemistry, and machine learning, particularly in the field of tensor networks.
+
+## Mathematical Formulation
+
+A tensor contraction sums the product of two tensors over a specified set of indices. It is most clearly expressed using Einstein summation notation (einsum).
+
+For example, a standard matrix multiplication $C_{ik} = \sum_j A_{ij} B_{jk}$ is written in einsum notation as:
+`ik = ij,jk`
+
+A tensor contraction can involve more dimensions and more contracted indices. For instance, contracting a 3D tensor `A` with a 4D tensor `B`:
+$D_{imn} = \sum_{j,k} A_{ijk} B_{kjmn}$
+In einsum notation, this is:
+`imn = ijk,kjmn`
+
+Here, the `j` and `k` indices are the "contracted" or "summation" indices, while `i`, `m`, and `n` are the "free" or "output" indices.
+
+Composable Kernel's contraction operation can perform any such contraction, provided there is a clear distinction between contracted indices and free indices for each tensor.
+
+## Algorithmic Strategy: Mapping Contraction to GEMM
+
+The dominant strategy for performing tensor contractions efficiently on GPUs is to reshape or "flatten" the input tensors into 2D matrices, perform a standard, highly-optimized GEMM, and then reshape the resulting matrix back into the desired output tensor shape.
+
+1.  **Tensor-to-Matrix Reshaping**:
+    -   The dimensions of each input tensor are partitioned into two sets: the contracted dimensions and the free (non-contracted) dimensions.
+    -   The tensor is then treated as a 2D matrix by flattening all the free dimensions into a single GEMM dimension (M for tensor A, N for tensor B) and all the contracted dimensions into the shared reduction dimension (K).
+    -   For example, in the contraction `imn = ijk,kjmn`:
+        -   Tensor A (`ijk`): Free index is `i`, contracted indices are `jk`. It is reshaped into a matrix A' of shape `[i, (j*k)]`.
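+        -   Tensor B (`kjmn`): Free indices are `mn`, contracted indices are `kj`. It is reshaped into a matrix B' of shape `[(k*j), (m*n)]`. The flattened K index of B' must pair up the same `(j, k)` combinations, in the same order, as the K index of A'.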
+        -   The GEMM computes `D' = A' x B'`. The resulting matrix D' has shape `[i, (m*n)]`.
+
+2.  **High-Performance GEMM**: A standard, block-tiled GEMM kernel is used to perform the matrix multiplication `A' x B'`. This is the computationally intensive part of the operation.
+
+3.  **Output Reshaping**: The resulting 2D matrix `D'` is then logically reshaped back into the desired multi-dimensional output tensor `D` of shape `[i, m, n]`.
+
+Crucially, the reshaping operations are often *logical*. The data is not physically moved or transposed in global memory. Instead, the GEMM kernel is provided with a "tensor descriptor" that understands the original N-dimensional layout and can calculate the correct memory addresses for the flattened 2D view on the fly. This avoids costly data movement and is key to performance.
+
+## Source Code Organization
+
+-   [`contraction_xdl.cpp`](./contraction_xdl.cpp): The main example file. It defines the input tensors and their layouts, specifies the contraction indices, and instantiates the `DeviceContraction` operation.
+-   [`../../include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp`](../../include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp): The high-level device interface for the contraction operation. It is highly generic and takes tensor descriptors that define the complex layouts and index mappings.
+-   The device interface internally creates a plan to map the contraction to a GEMM, then calls a standard `DeviceGemm` instance to execute it. The intelligence lies in how the tensor descriptors are configured to present a 2D matrix view of the higher-dimensional tensor data to the underlying GEMM kernel.
+
+## Build and Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build the Example
 ```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: time kernel (0=no, 1=yes)
-./bin/example_contraction_bilinear_xdl_fp32 1 1 1
+cd /path/to/composable_kernel/example/26_contraction
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
 ```
+
+### Run the Example
+```bash
+# Run the example with default settings
+./contraction_xdl
+
+# Run with verification, data initialization, and timing
+./contraction_xdl 1 2 1
+```
+
+## Applications
+
+Tensor contractions are the core computational primitive in a wide range of fields:
+
+-   **Tensor Network Methods**: In physics and chemistry, methods like DMRG (Density Matrix Renormalization Group) and PEPS (Projected Entangled Pair States) use networks of interconnected tensors to represent complex quantum states. The simulation of these systems involves sequences of tensor contractions.
+-   **High-Order Statistics**: In data analysis, computing higher-order moments (like skewness or kurtosis) can be expressed as tensor contractions.
+-   **Relativistic Physics**: Many equations in general relativity are expressed in the language of tensors and involve contractions.
+-   **Advanced Deep Learning Models**: Some research models, particularly in areas like quantum machine learning or geometric deep learning, use tensor contractions as a primary layer type, going beyond the capabilities of standard matrix multiplication.
--- a/example/27_layernorm2d_fwd/README.md
+++ b/example/27_layernorm2d_fwd/README.md
@@ -0,0 +1,59 @@
+# 2D Layer Normalization Forward
+
+## Theory
+
+This example demonstrates **2D layer normalization forward pass**. Layer normalization is used in transformers and other neural networks to normalize activations across the feature dimension, improving training stability.
+
+**Mathematical Formulation:**
+Given input $X[N, C, H, W]$:
+- Mean: $\mu = \frac{1}{CHW} \sum_{c,h,w} X_{n,c,h,w}$
+- Variance: $\sigma^2 = \frac{1}{CHW} \sum_{c,h,w} (X_{n,c,h,w} - \mu)^2$
+- Normalized: $\hat{X}_{n,c,h,w} = \frac{X_{n,c,h,w} - \mu}{\sqrt{\sigma^2 + \epsilon}}$
+- Output: $Y_{n,c,h,w} = \gamma \hat{X}_{n,c,h,w} + \beta$
+
+$\gamma$, $\beta$ are learnable scale and shift parameters.
+
+**Algorithmic Background:**
+- Computes mean and variance per sample (across all features).
+- Applies normalization and affine transformation.
+- Used in transformer blocks and normalization layers.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/27_layernorm2d_fwd
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./layernorm2d_fwd_xdl --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/27_layernorm2d_fwd/
+├── layernorm2d_fwd_xdl.cpp         # Main example: sets up, runs, and verifies 2D layernorm
+include/ck/tensor_operation/gpu/device/
+│   └── device_layernorm_fwd.hpp       # Device-level layernorm API
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_layernorm_fwd_impl.hpp  # Implementation
+include/ck/tensor_operation/gpu/grid/
+    └── gridwise_layernorm_fwd.hpp     # Grid-level kernel
+```
+
+### Key Classes and Functions
+
+- **DeviceLayernormFwd** (in `device_layernorm_fwd.hpp`):  
+  Device API for layer normalization.
+- **gridwise_layernorm_fwd** (in `gridwise_layernorm_fwd.hpp`):  
+  Implements the tiled/blocking layernorm kernel.
+
+This example demonstrates how Composable Kernel implements efficient layer normalization for transformer and deep learning models.
--- a/example/28_grouped_gemm_bias_e_permute/README.md
+++ b/example/28_grouped_gemm_bias_e_permute/README.md
@@ -0,0 +1,84 @@
+# Grouped GEMM with Bias, Elementwise Operation, and Permutation
+
+This example demonstrates a highly complex and specialized fusion: a **Grouped GEMM** where each individual GEMM operation is fused with a bias addition, a second elementwise operation, and a final permutation of the output. This kernel is designed to accelerate layers that have a group-parallel structure, such as depthwise separable convolutions or multi-head attention, when they are part of a larger fused computational graph.
+
+## Mathematical Formulation
+
+This operation performs `G` independent fused GEMM operations in parallel, where `G` is the group count. For each group `g` from `0` to `G-1`:
+
+1.  **GEMM Stage**: A standard matrix multiplication.
+    $C_{temp1[g]} = A_{[g]} \times B_{[g]}$
+
+2.  **Bias Addition Stage**: A bias vector `D_[g]` is broadcast and added.
+    $C_{temp2[g]} = C_{temp1[g]} + D_{[g]}$
+
+3.  **Elementwise Stage**: A second elementwise operation is performed with tensor `E_[g]`.
+    $C_{temp3[g]} = C_{temp2[g]} \odot E_{[g]}$
+
+4.  **Permutation Stage**: The final result for the group is permuted.
+    $F_{[g]} = \text{permute}(C_{temp3[g]})$
+
+All four stages for all `G` groups are executed within a single kernel launch. The intermediate results are kept in registers and never written to global memory.
+
+## Algorithmic Strategy: Group-Parallel GEMM with Fused Epilogue
+
+The implementation combines the scheduling strategy of Grouped GEMM with the multi-stage fused epilogue seen in `25_gemm_bias_e_permute`.
+
+1.  **Group Scheduling**: The `G` independent problems are distributed across the GPU's thread blocks. The grid-wise kernel is designed such that each thread block is assigned to compute one of the `G` fused operations.
+
+2.  **Fused GEMM Execution**: Once a thread block is assigned a group `g`, it executes a complete fused GEMM for that group's specific data. This involves:
+    -   Calculating the base memory addresses for $A_{[g]}, B_{[g]}, D_{[g]}, E_{[g]}$, and $F_{[g]}$ using the group index and the problem description for that group.
+    -   Executing a standard tiled GEMM for $A_{[g]} \times B_{[g]}$, accumulating the result in registers.
+    -   Executing the fused epilogue:
+        -   Load the bias `D_[g]` and add it.
+        -   Load the elementwise tensor `E_[g]` and apply the operation.
+        -   Calculate the permuted destination coordinates and write the final result to `F_[g]`.
+
+This approach maximizes parallelism at two levels: the coarse-grained parallelism across the `G` groups, and the fine-grained data parallelism within each individual GEMM operation.
+
+## Source Code Organization
+
+-   [`grouped_gemm_bias_e_permute_xdl.cpp`](./grouped_gemm_bias_e_permute_xdl.cpp): The main example file. It demonstrates the complex setup for a grouped problem, defining the `G` sets of input tensors and the permutation. It then instantiates the `DeviceGroupedGemmBiasEPermute` operation.
+-   [`../../include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_bias_e_permute_impl.hpp`](../../include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_bias_e_permute_impl.hpp): The high-level device interface for this specific fused operation. It takes arrays of tensor descriptors, one for each group.
+-   The underlying grid-wise kernel contains the logic to map thread blocks to groups and then execute the full fused GEMM pipeline for the assigned group.
+
+## Build and Run
+
+### Prerequisites
+Ensure the Composable Kernel library is built and installed.
+```bash
+cd /path/to/composable_kernel/build
+make -j install
+```
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/28_grouped_gemm_bias_e_permute
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+```bash
+# Run the example with default settings
+./grouped_gemm_bias_e_permute_xdl
+
+# Run with verification, data initialization, and timing
+./grouped_gemm_bias_e_permute_xdl 1 2 1
+```
+
+## Applications
+
+This highly specialized kernel is valuable for optimizing specific patterns in modern neural networks:
+
+-   **Multi-Head Attention (MHA)**: The computation for each head in MHA is independent. The entire MHA block can be viewed as a Grouped GEMM where the number of groups `G` is the number of attention heads. If the Q, K, or V projections involve fusions with bias, other elementwise ops, and permutations to prepare the data for the batched GEMM, this kernel could potentially fuse a large part of that logic.
+-   **Depthwise Separable Convolutions**: The depthwise part of this convolution is a Grouped GEMM with `G` equal to the number of channels. If this is followed by a fused activation function (e.g., a gated activation) and a permutation, this kernel could be a perfect match.
+-   **Mixture-of-Experts (MoE) Models**: In MoE layers, an input is routed to one of several "expert" sub-networks. If these experts have identical structure, their execution can be formulated as a Grouped GEMM, where `G` is the number of experts. Any fusions within the expert network could be captured by this kernel.
+
+This example showcases the extreme composability of the library, allowing for the creation of highly tailored, high-performance kernels for complex, group-parallel computational graphs.
--- a/example/29_batched_gemm_bias_e_permute/README.md
+++ b/example/29_batched_gemm_bias_e_permute/README.md
@@ -0,0 +1,91 @@
+# Batched GEMM with Bias, Elementwise Operation, and Permutation
+
+This example demonstrates a **Batched GEMM** where each individual GEMM operation is fused with a bias addition, a second elementwise operation, and a final permutation of the output. This kernel is designed to accelerate layers that have a batch-parallel structure, such as the dense layers in a Transformer's feed-forward network, when they are part of a larger fused computational graph.
+
+## Mathematical Formulation
+
+This operation performs `B` independent fused GEMM operations in parallel, where `B` is the batch count. For each batch item `b` from `0` to `B-1`:
+
+1.  **GEMM Stage**: A standard matrix multiplication.
+    $C_{temp1[b]} = A_{[b]} \times B_{[b]}$
+
+2.  **Bias Addition Stage**: A bias vector `D_[b]` is broadcast and added.
+    $C_{temp2[b]} = C_{temp1[b]} + D_{[b]}$
+
+3.  **Elementwise Stage**: A second elementwise operation is performed with tensor `E_[b]`.
+    $C_{temp3[b]} = C_{temp2[b]} \odot E_{[b]}$
+
+4.  **Permutation Stage**: The final result for the batch item is permuted.
+    $F_{[b]} = \text{permute}(C_{temp3[b]})$
+
+All four stages for all `B` batch items are executed within a single kernel launch. The intermediate results are kept in registers and never written to global memory.
+
+**Distinction from Grouped Version**:
+-   In this **Batched** version, all `B` problems are uniform. They share the same dimensions (M, N, K), layouts, and permutations. The input/output tensors are accessed with a constant batch stride.
+-   In the **Grouped** version (`28_grouped_gemm_bias_e_permute`), each of the `G` problems can have different dimensions, layouts, and strides, offering more flexibility.
+
+## Algorithmic Strategy: Batch-Parallel GEMM with Fused Epilogue
+
+The implementation combines the scheduling strategy of Batched GEMM with the multi-stage fused epilogue.
+
+1.  **Batch Scheduling**: The `B` independent problems are distributed across the GPU's thread blocks. The grid-wise kernel is designed such that each thread block is assigned to compute one of the `B` fused operations.
+
+2.  **Fused GEMM Execution**: Once a thread block is assigned a batch item `b`, it executes a complete fused GEMM for that item's specific data. This involves:
+    -   Calculating the base memory addresses for $A_{[b]}, B_{[b]}, D_{[b]}, E_{[b]}$, and $F_{[b]}$ using the batch index and the constant batch stride.
+    -   Executing a standard tiled GEMM for $A_{[b]} \times B_{[b]}$, accumulating the result in registers.
+    -   Executing the fused epilogue:
+        -   Load the bias `D_[b]` and add it.
+        -   Load the elementwise tensor `E_[b]` and apply the operation.
+        -   Calculate the permuted destination coordinates and write the final result to `F_[b]`.
+
+This approach is extremely efficient when the batch size `B` is large enough to saturate the GPU's parallelism.
+
+## Source Code Organization
+
+-   [`batched_gemm_bias_e_permute_xdl.cpp`](./batched_gemm_bias_e_permute_xdl.cpp): The main example file. It sets up the batched problem, defining the batch size, strides, and the single permutation rule that applies to all batch items. It then instantiates the `DeviceBatchedGemmBiasEPermute` operation.
+-   [`../../include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_bias_e_permute_impl.hpp`](../../include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_bias_e_permute_impl.hpp): The high-level device interface for this specific fused operation.
+-   The underlying grid-wise kernel contains the logic to map thread blocks to batch items (`block_to_batch`) and then execute the full fused GEMM pipeline for the assigned item.
+
+## Build and Run
+
+### Prerequisites
+Ensure the Composable Kernel library is built and installed.
+```bash
+cd /path/to/composable_kernel/build
+make -j install
+```
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/29_batched_gemm_bias_e_permute
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+```bash
+# Run the example with default settings
+./batched_gemm_bias_e_permute_xdl
+
+# Run with verification, data initialization, and timing
+./batched_gemm_bias_e_permute_xdl 1 2 1
+```
+
+## Applications
+
+This kernel is ideal for optimizing the feed-forward network (FFN) block in a Transformer, especially when layout transformations are needed between layers.
+
+A typical Transformer FFN block is:
+`FFN(X) = Linear_2(ReLU(Linear_1(X)))`
+
+-   `Linear_1` is a GEMM.
+-   `ReLU` is an elementwise activation.
+-   `Linear_2` is another GEMM.
+
+Sometimes, for performance reasons (e.g., to align with a subsequent layer's expected input layout), the output of the FFN needs to be permuted. This kernel could fuse the `Linear_2` GEMM with its bias, a subsequent elementwise operation (if any), and the final permutation, all while operating on a batch of input sequences. This avoids multiple kernel launches and saves significant memory bandwidth, leading to faster model execution.
--- a/example/30_grouped_conv_fwd_multiple_d/README.md
+++ b/example/30_grouped_conv_fwd_multiple_d/README.md
@@ -1,4 +1,63 @@
-Command
+# Grouped Convolution Forward with Multiple Elementwise Inputs
+
+This example demonstrates a **Grouped Convolution Forward Pass** fused with an elementwise operation that takes multiple auxiliary input tensors (`D` tensors). This is a powerful fusion that combines the parallel structure of grouped convolutions with the ability to merge subsequent elementwise layers, such as custom activations or residual connections, into a single kernel.
+
+## Mathematical Formulation
+
+This operation performs `G` independent fused convolution operations in parallel, where `G` is the group count. For each group `g` from `0` to `G-1`:
+
+1.  **Convolution Stage**: A standard N-dimensional forward convolution is performed for the group.
+    $C_{out[g]} = \text{Conv}(\text{In}_{[g]}, \text{W}_{[g]})$
+
+2.  **Elementwise Stage**: The result of the convolution is combined with one or more auxiliary tensors ($D_{0[g]}, D_{1[g]}, \dots$) using a user-defined elementwise function `f`.
+    $E_{[g]} = f(C_{out[g]}, D_{0[g]}, D_{1[g]}, \dots)$
+
+The key optimization is that the intermediate convolution result, $C_{out[g]}$, is never written to global memory. It is computed and held in registers, then immediately consumed by the elementwise part of the kernel's epilogue before the final result `E` is stored.
+
+## Algorithmic Strategy: Implicit Grouped GEMM with Fused Multi-D Epilogue
+
+The implementation combines three core concepts: the implicit GEMM transformation for convolutions, the group-parallel scheduling of Grouped GEMM, and a multi-input fused epilogue.
+
+1.  **Group Scheduling**: The `G` independent problems are distributed across the GPU's thread blocks. Each thread block is assigned to compute the fused convolution for one of the `G` groups.
+
+2.  **Implicit GEMM Core**: Once a thread block is assigned a group `g`, it executes the convolution for that group using the implicit GEMM algorithm. This involves:
+    -   Calculating the base memory addresses for the group's input tensors: $\text{In}_{[g]}, \text{W}_{[g]}, D_{0[g]}, \dots, E_{[g]}$.
+    -   Performing a tiled GEMM, where tiles of the input `In` and weights `W` are read (with the `im2col` transformation happening on-the-fly) and the result is accumulated in registers.
+
+3.  **Fused Multi-D Epilogue**: Before writing the result to global memory, the epilogue performs the elementwise fusion:
+    -   Threads load the corresponding tiles from the auxiliary `D` tensors for the assigned group.
+    -   The user-defined elementwise function `f` is applied in registers to the convolution result and the `D` tensor values.
+    -   The final result `E` for the group is written to global memory.
+
+This strategy is highly efficient as it minimizes memory bandwidth by avoiding the materialization of the intermediate convolution output and maximizes parallelism by executing all groups concurrently.
+
+## Source Code Organization
+
+-   [`grouped_conv_fwd_multiple_d_xdl.cpp`](./grouped_conv_fwd_multiple_d_xdl.cpp): The main example file. It sets up the grouped convolution problem, including the multiple `D` tensors, and instantiates the `DeviceGroupedConvFwdMultipleD` operation.
+-   [`../../include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp`](../../include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp): The high-level device interface for this operation. It takes arrays of tensor descriptors, one for each group for each of the `D` tensors.
+-   The underlying grid-wise kernel contains the logic to map thread blocks to groups and then execute the full implicit GEMM pipeline with the fused multi-D epilogue for the assigned group.
+
+## Build and Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/30_grouped_conv_fwd_multiple_d
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+
 ```bash
 arg1: verification (0=no, 1=yes)
 arg2: initialization (0=no init, 1=integer value, 2=decimal value)
@@ -16,3 +75,10 @@ Following arguments (depending on number of spatial dims):
 ./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1
 ```

+## Applications
+
+This kernel is ideal for optimizing layers in modern CNNs that use grouped convolutions followed by complex activations or residual connections.
+
+-   **Fused Residual Connections**: A common pattern is `Conv(x) + x`. This can be implemented by passing the input `x` as a `D` tensor and defining the elementwise function as `f(conv_out, d0) = conv_out + d0`. If this is a grouped convolution, this kernel is a perfect fit.
+-   **Custom Gated Activations**: Some architectures use gated activations, such as `Conv_A(x) * sigmoid(Conv_B(x))`. While this kernel doesn't compute two convolutions, it can fuse one convolution with an elementwise multiplication against another tensor. For example, it could compute `Conv_A(x) * D0`, where `D0` is the pre-computed `sigmoid(Conv_B(x))`.
+-   **Depthwise Separable Convolutions**: These layers consist of a depthwise convolution (a grouped convolution with `G = C`) followed by a pointwise convolution (`1x1` conv). If there is a residual connection or other elementwise operation after the depthwise stage, this kernel can fuse it directly, improving the performance of this widely used building block.
--- a/example/31_batched_gemm_gemm/README.md
+++ b/example/31_batched_gemm_gemm/README.md
@@ -0,0 +1,73 @@
+# Fused Batched GEMM-GEMM
+
+This example demonstrates a **Batched GEMM-GEMM** operation, where two sequential General Matrix-Matrix Multiplications are fused into a single high-performance kernel. This pattern is common in multi-layer perceptrons (MLPs) and is a core component of the feed-forward network (FFN) block in Transformer models.
+
+## Mathematical Formulation
+
+The operation computes a chain of two matrix multiplications, batched `B` times. For each batch item `b` from `0` to `B-1`:
+
+1.  **First GEMM (GEMM0)**:
+    $D_{temp[b]} = A_{[b]} \times B_{[b]}$
+    Where `A` has shape `[B, M, K0]`, `B` has shape `[B, K0, N]`. The intermediate result `D_temp` has shape `[B, M, N]`.
+
+2.  **Second GEMM (GEMM1)**:
+    $E_{[b]} = D_{temp[b]} \times C_{[b]}$
+    Where `D_temp` (the output of GEMM0) has shape `[B, M, N]` and `C` has shape `[B, N, K1]`. The final output `E` has shape `[B, M, K1]`.
+
+The critical optimization is that the intermediate tensor `D_temp` is **never written to global memory**. It is produced and consumed entirely within the GPU's on-chip memory (registers and LDS/shared memory), saving a massive amount of memory bandwidth.
+
+## Algorithmic Strategy: Fused GEMM-GEMM via Shared Memory
+
+The implementation uses a batch-parallel approach where each thread block is assigned a single batch item. Within the block, the two GEMMs are fused using shared memory as a buffer.
+
+1.  **Batch Scheduling**: The `B` independent GEMM-GEMM problems are distributed across the GPU's thread blocks. Each thread block is assigned to compute the full chain for one batch item `b`.
+
+2.  **Fused Execution within a Thread Block**:
+    -   **Compute GEMM0 Tile**: The thread block first computes a tile of the intermediate tensor, $D_{temp[b]}$, using a standard tiled GEMM algorithm. The result of this computation is stored directly into a designated region of **shared memory (LDS)**.
+    -   **Synchronization**: A block-wide synchronization (`__syncthreads()`) is performed. This is a critical step that ensures the *entire* tile of $D_{temp[b]}$ is visible to all threads in the block before the second GEMM begins.
+    -   **Compute GEMM1 Tile**: The threads then immediately start computing the second GEMM. They use the intermediate tile stored in shared memory as the "A" matrix for this second GEMM, multiplying it with tiles of the `C` matrix. The result is accumulated in registers.
+    -   **Store Final Result**: Once a tile of the final output `E` is computed, it is written to global memory.
+
+This "producer-consumer" pattern within a thread block is highly efficient. It treats shared memory as a fast, programmable cache for the intermediate tensor, completely avoiding the slow round-trip to global memory (HBM).
+
+## Source Code Organization
+
+-   [`batched_gemm_gemm_xdl.cpp`](./batched_gemm_gemm_xdl.cpp): The main example file. It sets up the three input tensors (A, B, C) for the batched operation and instantiates the `DeviceBatchedGemmGemm` operation.
+-   [`../../include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp`](../../include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp): The high-level device interface for the fused Batched GEMM-GEMM operation.
+-   The underlying grid-wise kernel implements the complex fusion logic, managing the register usage for GEMM0, the write to shared memory, the synchronization, and the subsequent computation of GEMM1 using the data from shared memory.
+
+## Build and Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/31_batched_gemm_gemm
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+```bash
+# Run the example with default settings
+./batched_gemm_gemm_xdl
+
+# Run with verification, data initialization, and timing
+./batched_gemm_gemm_xdl 1 2 1
+```
+
+## Application to Transformer FFN
+
+This kernel is perfectly suited to optimize the Feed-Forward Network (FFN) block found in every layer of a Transformer model. The FFN is typically defined as:
+
+`FFN(X) = Linear_2(Activation(Linear_1(X)))`
+
+Where `Linear_1` and `Linear_2` are dense layers (GEMMs). If the activation function can also be fused (e.g., ReLU or GeLU), an even more complex kernel can be used. However, this `GEMM-GEMM` kernel provides the core fusion for the two most computationally expensive parts of the FFN. By fusing `Linear_1` and `Linear_2`, this kernel can significantly reduce the latency and memory bandwidth of the FFN block, leading to faster end-to-end model training and inference.
--- a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
+++ b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
@@ -1,8 +1,10 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 bool run_batched_gemm_gemm_example(int argc, char* argv[])
 {
    bool do_verification = true;
@@ -111,12 +113,12 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[])
        if(std::is_same<decltype(layout), Row>::value)
        {
            return HostTensorDescriptor(
-                {batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
+                {batch_count, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
        }
        else
        {
            return HostTensorDescriptor(
-                {batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
+                {batch_count, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
        }
    };

--- a/example/32_batched_gemm_scale_softmax_gemm/README.md
+++ b/example/32_batched_gemm_scale_softmax_gemm/README.md
@@ -0,0 +1,61 @@
+# Batched GEMM-Scale-Softmax-GEMM: Fused Attention
+
+## Theory
+
+This example demonstrates the **fused attention mechanism** used in transformer models, implementing the sequence: batched Q×K^T → scaling → softmax → ×V in a single kernel. This pattern is critical for efficient transformer inference and training.
+
+**Mathematical Formulation:**
+- Attention: $\text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$
+- $Q$: [B, H, N, d_k] queries
+- $K$: [B, H, N, d_k] keys
+- $V$: [B, H, N, d_v] values
+- $O$: [B, H, N, d_v] output
+
+**Algorithmic Background:**
+- Computes Q×K^T, scales by $1/\sqrt{d_k}$, applies softmax, then multiplies by V.
+- Uses numerically stable softmax and memory-efficient tiling.
+- Used in multi-head attention and transformer blocks.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/32_batched_gemm_scale_softmax_gemm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./batched_gemm_scale_softmax_gemm_xdl --batch=32 --heads=12 --seq_len=512 --head_dim=64 --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/32_batched_gemm_scale_softmax_gemm/
+└── batched_gemm_scale_softmax_gemm_xdl.cpp         # Main example: sets up, runs, and verifies fused attention
+include/ck/tensor_operation/gpu/device/
+└── device_batched_gemm_scale_softmax_gemm.hpp           # Device-level fused attention API
+include/ck/tensor_operation/gpu/device/impl/
+├── device_batched_attention_impl.hpp                    # Attention-specific implementation
+└── device_online_softmax_impl.hpp                       # Online softmax implementation
+include/ck/tensor_operation/gpu/grid/
+├── gridwise_batched_gemm_softmax.hpp                    # Grid-level fused attention kernel
+└── gridwise_online_softmax.hpp                          # Grid-level online softmax
+```
+
+### Key Classes and Functions
+
+- **DeviceBatchedGemmScaleSoftmaxGemm** (in `device_batched_gemm_scale_softmax_gemm.hpp`):  
+  Device API for fused attention.
+- **gridwise_batched_gemm_softmax** (in `gridwise_batched_gemm_softmax.hpp`):  
+  Implements the tiled/blocking fused attention kernel.
+- **gridwise_online_softmax** (in `gridwise_online_softmax.hpp`):  
+  Implements numerically stable, memory-efficient softmax.
+
+This example demonstrates how Composable Kernel implements efficient, fused attention for transformer and large language models.
--- a/example/33_multiple_reduce/README.md
+++ b/example/33_multiple_reduce/README.md
@@ -1,6 +1,73 @@
-# Instructions for ```example_dual_reduce```
+# Multiple Reductions
+
+This example demonstrates a **Multiple Reduction** operation, where several different reduction computations (e.g., sum, average, max, min) are performed on the same input tensor in a single kernel launch. This is a highly efficient pattern when multiple statistics are needed for a tensor, as it requires only one read pass over the (potentially very large) input data.
+
+## Mathematical Formulation
+
+Given an input tensor `A`, this operation computes a set of `N` output scalars or vectors, $\{R_0, R_1, \dots, R_{N-1}\}$, where each $R_i$ is the result of a different reduction operation applied to `A`.
+
+$R_0 = \bigoplus_0 A$
+$R_1 = \bigoplus_1 A$
+...
+$R_{N-1} = \bigoplus_{N-1} A$
+
+Where $\bigoplus_i$ represents a distinct reduction operation, such as:
+-   `sum`: $\sum_j A_j$
+-   `avg`: $\frac{1}{n} \sum_j A_j$, where $n$ is the number of elements being reduced
+-   `max`: $\max_j(A_j)$
+-   `min`: $\min_j(A_j)$
+-   `sum of squares`: $\sum_j A_j^2$
+
+The reductions can be performed over the entire tensor to produce a scalar, or along specific dimensions to produce a lower-rank tensor.
+
+## Algorithmic Strategy: Fused Parallel Reduction
+
+The implementation uses a classic parallel reduction algorithm but extends it to handle multiple reduction functions simultaneously.
+
+1.  **Grid Scheduling**: The input tensor is partitioned across the GPU's thread blocks. Each block is responsible for reducing a slice of the input data.
+
+2.  **Intra-Block Reduction**:
+    -   **Loading**: Threads within a block cooperatively load their assigned slice of the input tensor `A` into shared memory.
+    -   **Fused Accumulation**: Each thread maintains a separate set of accumulators in its private registers, one for each of the `N` reduction operations being performed.
+    -   As threads iterate through the data in shared memory, they update all of their accumulators simultaneously. For example, for each element `a`, a thread might update its `sum_accumulator += a`, `max_accumulator = max(max_accumulator, a)`, and `sum_sq_accumulator += a*a`.
+    -   **Tree-Based Reduction**: After processing all elements in the slice, the threads perform a parallel reduction using shared memory. This is done *for each of the N reduction types*. For example, they first reduce all the `sum_accumulator` values to get the block's partial sum, then they reduce all the `max_accumulator` values to get the block's partial max, and so on.
+
+3.  **Inter-Block Reduction**:
+    -   Each thread block writes its `N` partial results (the block's partial sum, partial max, etc.) to `N` separate temporary arrays in global memory.
+    -   A final, small reduction kernel is launched (or atomic operations are used) for each of the `N` temporary arrays to combine the partial results from all blocks into the final `N` output values.
+
+The key to this kernel's efficiency is that the expensive part—reading the input tensor `A` from global memory—is only done once. All subsequent computations happen on-chip.
+
+## Source Code Organization
+
+-   [`multiple_reduce_xdl.cpp`](./multiple_reduce_xdl.cpp): The main example file. It sets up the input tensor and defines the multiple reduction operations to be performed. It then instantiates the `DeviceMultipleReduce` operation.
+-   [`../../include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp`](../../include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp): The high-level device interface for the multiple reduction operation. It takes a tuple of structs, where each struct defines one of the reduction operations to be performed.
+-   [`../../include/ck/tensor_operation/gpu/grid/gridwise_multiple_reduce.hpp`](../../include/ck/tensor_operation/gpu/grid/gridwise_multiple_reduce.hpp): The grid-wise kernel that implements the fused parallel reduction algorithm. It is heavily templated to generate the specific accumulation and reduction logic for the requested set of operations.
+
+## Build and Run
+
+### Prerequisites
+Ensure the Composable Kernel library is built and installed.
+```bash
+cd /path/to/composable_kernel/build
+make -j install
+```
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/33_multiple_reduce
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run ```example_dual_reduce_multiblock```

-## Run ```example_dual_reduce_multiblock```
 ```bash
 # -D <xxx> : input 4-d tensor lengths
 # -v <x> :   verification (0=no, 1=yes)
@@ -18,7 +85,7 @@ Start running 10 times...
 Perf: 1.19529 ms, 201.499 GB/s, DeviceMultipleReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_1_InSrcVectorSize_1,OutDstVectorSize_1_1>
 ```

-## Run ```example_dual_reduce_threadwise```
+### Run ```example_dual_reduce_threadwise```
 ```bash
 # -D <xxx> : input 4-d tensor lengths
 # -v <x> :   verification (0=no, 1=yes)
@@ -35,3 +102,11 @@ Warm up 1 time
 Start running 10 times...
 Perf: 0.01512 ms, 71.9577 GB/s, DeviceMultipleReduceThreadwise<256,M_C256_S1,K_C1_S4,InSrcVectorDim_1_InSrcVectorSize_2,OutDstVectorSize_1_1>
 ```
+
+## Applications
+
+This operation is extremely useful for computing statistics and implementing normalization layers.
+
+-   **Normalization Layers**: Both Batch Normalization and Layer Normalization require computing the mean and variance of a tensor. Variance is defined as $\sigma^2 = E[X^2] - (E[X])^2$. This requires two statistics: the sum of elements (for the mean, $E[X]$) and the sum of squares of elements (for $E[X^2]$). This kernel can compute both in a single pass, making it a highly efficient way to calculate the moments needed for normalization.
+-   **Data Analytics**: When analyzing a large dataset, one might want to compute its min, max, mean, and standard deviation all at once. This kernel can perform all the necessary underlying reductions in a single, efficient operation.
+-   **Loss Function Components**: Some complex loss functions might involve multiple statistical properties of a model's output. This kernel can compute them efficiently.
--- a/example/34_batchnorm/README.md
+++ b/example/34_batchnorm/README.md
@@ -1,4 +1,39 @@
-# Instructions for ```batchnorm nhwc``` Example
+# Batch Normalization Forward
+
+## Theory
+
+This example demonstrates **batch normalization forward pass**. Batch normalization is used in deep neural networks to normalize activations across the batch dimension, improving training stability and convergence.
+
+**Mathematical Formulation:**
+Given input $X[N, C, ...]$:
+- Mean: $\mu_c = \frac{1}{N \cdot ...} \sum_{n,...} X_{n,c,...}$
+- Variance: $\sigma^2_c = \frac{1}{N \cdot ...} \sum_{n,...} (X_{n,c,...} - \mu_c)^2$
+- Normalized: $\hat{X}_{n,c,...} = \frac{X_{n,c,...} - \mu_c}{\sqrt{\sigma^2_c + \epsilon}}$
+- Output: $Y_{n,c,...} = \gamma_c \hat{X}_{n,c,...} + \beta_c$
+
+$\gamma_c$, $\beta_c$ are learnable scale and shift parameters per channel.
+
+**Algorithmic Background:**
+- Computes mean and variance per channel (across batch and spatial dimensions).
+- Applies normalization and affine transformation.
+- Used in CNNs, MLPs, and other deep learning models.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/34_batchnorm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./batchnorm_fwd_xdl --verify=1 --time=1
+```

 ## Run ```batchnorm forward nhwc```
 ```bash
@@ -79,3 +114,26 @@ Warm up 1 time
 Start running 10 times...
 Perf: 0.411026 ms, 91.8702 GB/s
 ```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/34_batchnorm/
+└── batchnorm_fwd_xdl.cpp         # Main example: sets up, runs, and verifies batchnorm
+include/ck/tensor_operation/gpu/device/
+└── device_batchnorm_fwd.hpp           # Device-level batchnorm API
+include/ck/tensor_operation/gpu/device/impl/
+└── device_batchnorm_fwd_impl.hpp      # Implementation
+include/ck/tensor_operation/gpu/grid/
+└── gridwise_batchnorm_fwd.hpp         # Grid-level kernel
+```
+
+### Key Classes and Functions
+
+- **DeviceBatchnormFwd** (in `device_batchnorm_fwd.hpp`):  
+  Device API for batch normalization.
+- **gridwise_batchnorm_fwd** (in `gridwise_batchnorm_fwd.hpp`):  
+  Implements the tiled/blocking batchnorm kernel.
+
+This example demonstrates how Composable Kernel implements efficient batch normalization for deep learning models.
--- a/example/35_splitK_gemm/README.md
+++ b/example/35_splitK_gemm/README.md
@@ -0,0 +1,82 @@
+# GEMM with K-Axis Splitting (Split-K GEMM)
+
+This example demonstrates a General Matrix-Matrix Multiplication (GEMM) implemented with a **Split-K** algorithm. This is a technique used to increase the available parallelism for a single, large GEMM operation, which can lead to higher performance, especially on GPUs with a very large number of compute units.
+
+## Mathematical Formulation
+
+A standard GEMM computes the matrix product $C = A \times B$, where `A` has shape `[M, K]` and `B` has shape `[K, N]`. The computation is:
+$C_{ij} = \sum_{k=0}^{K-1} A_{ik} B_{kj}$
+
+In a Split-K algorithm, the `K` dimension is split into `S` chunks of size `K_split = K / S`. The GEMM is then broken down into `S` smaller, partial GEMMs.
+
+For each split `s` from `0` to `S-1`:
+-   Let $A_s$ be the s-th slice of `A` along the K-axis (shape `[M, K_split]`).
+-   Let $B_s$ be the s-th slice of `B` along the K-axis (shape `[K_split, N]`).
+-   A partial product is computed: $C_s = A_s \times B_s$.
+
+The final result `C` is the sum of all the partial products:
+$C = \sum_{s=0}^{S-1} C_s = C_0 + C_1 + \dots + C_{S-1}$
+
+## Algorithmic Strategy: Parallel Reduction of Partial GEMMs
+
+The Split-K algorithm turns a single large GEMM into multiple smaller GEMMs whose results must be reduced (summed). This introduces a new axis of parallelism.
+
+1.  **Splitting the K-Dimension**: The `K` dimension of the input matrices `A` and `B` is logically split into `S` parts. The `S` value (passed to the device interface as the `k_batch` parameter) should be chosen based on the problem size and hardware characteristics to expose a suitable amount of parallelism.
+
+2.  **Parallel Partial GEMMs**: The `S` partial GEMMs are executed in parallel. The GPU's grid of thread blocks gains an extra dimension: it maps not only to the tiles along the M and N dimensions of the output matrix `C`, but also to the `S` splits of the K dimension.
+    -   A thread block is assigned to compute a tile of a *partial* product $C_s$.
+
+3.  **Reduction of Partial Results**: The key challenge is how to sum the partial products $C_s$ efficiently.
+    -   **Atomic Add**: The simplest method is for each block to compute its tile of $C_s$ and then use atomic add operations to accumulate its result directly into the final output matrix `C` in global memory. This is easy to implement but can suffer from high contention on the atomic operations, especially if many splits are trying to update the same memory location.
+    -   **Two-Stage Reduction**: A more robust approach involves two stages:
+        -   **Stage 1 (Partial Products)**: Each of the `S` parallel GEMMs writes its full partial product $C_s$ to a temporary workspace in global memory.
+        -   **Stage 2 (Final Reduction)**: A separate reduction kernel is launched to sum the `S` partial products from the workspace into the final output matrix `C`.
+
+Composable Kernel's implementation abstracts this complexity. Given the split factor `S` (the `k_batch` argument), the `DeviceGemmSplitK` interface handles the launch of the parallel partial GEMMs and the final reduction step.
+
+## Source Code Organization
+
+-   [`splitk_gemm_xdl.cpp`](./splitk_gemm_xdl.cpp): The main example file. It sets up a standard GEMM problem and instantiates the `DeviceGemmSplitK` operation.
+-   [`../../include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp`](../../include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp): The high-level device interface for the Split-K GEMM. It takes an additional `k_batch` parameter which controls the number of splits.
+-   The underlying grid-wise kernel is modified to accept a `k_batch` index, so that each thread block knows which slice of the `A` and `B` matrices it is responsible for. It also includes the logic for the reduction (e.g., using atomic adds).
+
+## Build and Run
+
+### Prerequisites
+Ensure the Composable Kernel library is built and installed.
+```bash
+cd /path/to/composable_kernel/build
+make -j install
+```
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/35_splitK_gemm
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+```bash
+# Run the example with default settings
+./splitk_gemm_xdl
+
+# Run with verification, data initialization, and timing
+./splitk_gemm_xdl 1 2 1
+```
+
+## When is Split-K Useful?
+
+Split-K is not always faster than a standard GEMM. It is most beneficial in specific scenarios:
+
+-   **"Skinny" GEMMs**: For GEMMs where `M` and `N` are small but `K` is very large (e.g., `M=64, N=64, K=65536`). A standard GEMM might not generate enough parallel work to fill a large GPU. By splitting the large `K` dimension, we create many more independent work items, improving hardware utilization.
+-   **Limited Shared Memory**: If a standard GEMM requires a very large tile size (and thus a large amount of shared memory) to be efficient, Split-K can be an alternative. It can use smaller tiles for the partial GEMMs, reducing the shared memory footprint per block.
+-   **Load Balancing**: It can help with load balancing on heterogeneous hardware or in complex fused scenarios.
+
+The trade-off is the overhead of the reduction step. The performance gain from increased parallelism must outweigh the cost of either atomic operations or writing and re-reading intermediate results.
--- a/example/36_sparse_embedding/README.md
+++ b/example/36_sparse_embedding/README.md
@@ -0,0 +1,80 @@
+# Sparse Embedding Lookup
+
+This example demonstrates a **sparse embedding lookup**, a fundamental operation in deep learning models that process sparse, high-cardinality categorical features, such as words in a vocabulary or user IDs in a recommendation system. The operation gathers feature vectors from a large embedding table based on a set of sparse input indices.
+
+## Mathematical Formulation
+
+The operation can be described as a lookup or gather operation.
+
+Given:
+-   An **Embedding Table** `W`, a dense 2D tensor of shape `[VocabularySize, EmbeddingDim]`. Each row of `W` is a feature vector (an embedding) for a specific category.
+-   A set of **Indices** `I`, a tensor of integer IDs (e.g., shape `[BatchSize, SequenceLength]`) that specify which embeddings to look up.
+-   An optional **Sparsity-aware Optimizer** state, such as momentum vectors, which must also be looked up and updated.
+
+The operation produces an **Output Tensor** `O` by gathering the rows from `W` corresponding to the indices in `I`.
+$O_{bsj} = W_{I_{bs}, j}$
+
+Where `b` is the batch index, `s` is the sequence index, and `j` is the embedding dimension index. The output tensor `O` will have a shape like `[BatchSize, SequenceLength, EmbeddingDim]`.
+
+## Algorithmic Strategy: Parallel Gather
+
+Unlike compute-bound operations like GEMM, an embedding lookup is almost entirely **memory-bound**. The primary challenge is to perform the gather operation from the potentially very large embedding table `W` as efficiently as possible.
+
+1.  **Grid Scheduling**: The lookup problem is parallelized over the indices. The grid of threads is typically launched to match the shape of the index tensor `I`. Each thread is assigned to handle the lookup for a single index.
+
+2.  **Gather Operation**:
+    -   Each thread reads its assigned index `id = I[b, s]` from the index tensor.
+    -   The thread then calculates the memory address of the start of the corresponding embedding vector in the table `W`. This is typically `address = base_address_W + id * EmbeddingDim * sizeof(DataType)`.
+    -   The thread then reads the entire embedding vector of size `EmbeddingDim` from that address in global memory and writes it to the corresponding position in the output tensor `O`.
+
+3.  **Memory Access Coalescing**: Performance is highly dependent on the memory access patterns.
+    -   If multiple threads in a warp access indices that are close to each other, their memory reads from the embedding table `W` might also be close, leading to some coalescing and better memory bandwidth utilization.
+    -   However, if the indices are random and scattered, the memory accesses will be random, leading to poor cache utilization and low memory bandwidth. This is often the bottleneck.
+
+4.  **Fused Optimizer Update**: In training, the embedding lookup is part of a larger forward-backward-update cycle. For sparse features, only the embedding vectors that were actually used (the "hot" embeddings) need their gradients computed and their weights updated. High-performance implementations often fuse the backward pass (gradient accumulation) and the optimizer step (e.g., SGD or Adam update) for these hot embeddings directly into a specialized kernel to avoid multiple passes over the embedding table. This example focuses on the forward-pass lookup.
+
+## Source Code Organization
+
+-   [`sparse_embedding_xdl.cpp`](./sparse_embedding_xdl.cpp): The main example file. It sets up the embedding table `W`, the index tensor `I`, and instantiates the `DeviceSparseEmbedding` operation.
+-   [`../../include/ck/tensor_operation/gpu/device/device_sparse_embedding.hpp`](../../include/ck/tensor_operation/gpu/device/device_sparse_embedding.hpp): The high-level device interface for the sparse embedding lookup.
+-   The underlying grid-wise kernel is a straightforward gather kernel. Its performance is almost entirely dictated by the efficiency of its memory load and store operations.
+
+## Build and Run
+
+### Prerequisites
+Ensure the Composable Kernel library is built and installed.
+```bash
+cd /path/to/composable_kernel/build
+make -j install
+```
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/36_sparse_embedding
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+```bash
+# Run the example with default settings
+./sparse_embedding_xdl
+
+# Run with verification, data initialization, and timing
+./sparse_embedding_xdl 1 2 1
+```
+
+## Applications
+
+Embedding layers are the first step in a vast number of deep learning models:
+
+-   **Natural Language Processing (NLP)**: Models like BERT and GPT use embedding layers to convert integer token IDs from a vocabulary into dense vector representations.
+-   **Recommender Systems**: Models use embeddings to represent users and items. The input to the model is often a set of sparse IDs (e.g., user ID, watched movie IDs), which are converted to dense vectors via embedding lookups. Embedding tables in these systems can be enormous (terabytes in size).
+-   **Graph Neural Networks**: Nodes in a graph are often represented by feature vectors, which can be stored in an embedding table and looked up as needed.
+-   **Any model with categorical features**: Whenever a model needs to process non-numeric categorical data (e.g., "product category", "day of the week"), it is typically first converted to an integer ID and then to a dense vector via an embedding layer.
--- a/example/37_batched_gemm_add_add_relu_gemm_add/README.md
+++ b/example/37_batched_gemm_add_add_relu_gemm_add/README.md
@@ -0,0 +1,104 @@
+# Fused Batched GEMM-Add-Add-ReLU-GEMM-Add
+
+This example demonstrates an exceptionally deep and complex fusion, chaining two GEMMs with multiple elementwise additions and a ReLU activation. This pattern is designed to fuse a significant portion of a residual block, such as the feed-forward network (FFN) in a Transformer, into a single, highly optimized kernel.
+
+## Mathematical Formulation
+
+The operation computes a complex chain of operations, batched `B` times. For each batch item `b` from `0` to `B-1`:
+
+1.  **First GEMM (GEMM0)**:
+    $C_{temp1[b]} = A_{[b]} \times B_{[b]}$
+
+2.  **First Add (Add0)**: An elementwise addition with tensor `D0`.
+    $C_{temp2[b]} = C_{temp1[b]} + D0_{[b]}$
+
+3.  **Second Add (Add1)**: Another elementwise addition with tensor `D1`.
+    $C_{temp3[b]} = C_{temp2[b]} + D1_{[b]}$
+
+4.  **Activation (ReLU)**: A Rectified Linear Unit activation is applied.
+    $C_{temp4[b]} = \text{ReLU}(C_{temp3[b]})$
+
+5.  **Second GEMM (GEMM1)**: The result is fed into a second GEMM.
+    $E_{temp[b]} = C_{temp4[b]} \times C_{[b]}$
+
+6.  **Third Add (Add2)**: A final elementwise addition with tensor `D2`.
+    $E_{[b]} = E_{temp[b]} + D2_{[b]}$
+
+The key optimization is that all intermediate tensors ($C_{temp1}$ through $E_{temp}$) are **never written to global memory**. They are produced and consumed entirely within the GPU's on-chip memory (registers and LDS/shared memory).
+
+## Algorithmic Strategy: Deeply Fused Producer-Consumer Chain
+
+This kernel represents a pinnacle of fusion capability. It chains two "producer-consumer" GEMMs together, with a series of elementwise operations fused into the epilogue of the first GEMM.
+
+1.  **Batch Scheduling**: The `B` independent problems are distributed across the GPU's thread blocks. Each thread block is assigned to compute the full chain for one batch item `b`.
+
+2.  **Fused Execution within a Thread Block**:
+    -   **Compute GEMM0 Tile**: The thread block computes a tile of the first GEMM, $A_{[b]} \times B_{[b]}$. The result is held in registers.
+    -   **Fused Epilogue (Add-Add-ReLU)**: Before this intermediate result is stored anywhere, the epilogue operations are applied directly to the data in registers.
+        -   Load corresponding elements from `D0` and `D1`.
+        -   Perform the two additions.
+        -   Apply the ReLU activation.
+    -   **Store to Shared Memory**: The result of this entire fused chain ($C_{temp4}$) is written to a designated region of **shared memory (LDS)**.
+    -   **Synchronization**: A block-wide synchronization (`__syncthreads()`) ensures the intermediate result in LDS is visible to all threads in the block.
+    -   **Compute GEMM1 Tile**: The threads immediately start the second GEMM, using the tile in shared memory as the input, multiplying it with tiles of `C`. The result is accumulated in registers.
+    -   **Final Fused Epilogue (Add)**: Before the final result is stored, the last addition is fused.
+        -   Load corresponding elements from `D2`.
+        -   Perform the final addition in registers.
+    -   **Store Final Result**: The final result `E` is written to global memory.
+
+This deep fusion avoids five separate kernel launches and the associated read/write traffic for five large intermediate tensors ($C_{temp1}$ through $E_{temp}$), resulting in a massive performance improvement.
+
+## Source Code Organization
+
+-   [`batched_gemm_add_add_relu_gemm_add_xdl.cpp`](./batched_gemm_add_add_relu_gemm_add_xdl.cpp): The main example file. It sets up the numerous input tensors (A, B, C, D0, D1, D2) and instantiates the highly specialized device-level operation.
+-   The device-level interface and underlying grid-wise kernel for this operation are extremely complex, templated on the multiple elementwise operations and managing the intricate data flow between registers, shared memory, and global memory.
+
+## Build and Run
+
+### Prerequisites
+Ensure the Composable Kernel library is built and installed.
+```bash
+cd /path/to/composable_kernel/build
+make -j install
+```
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/37_batched_gemm_add_add_relu_gemm_add
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+```bash
+# Run the example with default settings
+./batched_gemm_add_add_relu_gemm_add_xdl
+
+# Run with verification, data initialization, and timing
+./batched_gemm_add_add_relu_gemm_add_xdl 1 2 1
+```
+
+## Application to Transformer FFN Block
+
+This kernel can fuse almost the entire Feed-Forward Network (FFN) block of a standard Transformer, including the residual connections.
+
+A typical FFN block with pre-layer-normalization looks like this:
+`Z = LayerNorm(X)`
+`Y = Linear_2(ReLU(Linear_1(Z)))`
+`Output = X + Y`
+
+This kernel can compute `Y` and the final residual addition:
+-   `A`: The normalized input `Z`.
+-   `B`: The weight matrix for `Linear_1`.
+-   `D0`: The bias for `Linear_1`.
+-   `D1`: Not used in this specific mapping (can be zero).
+-   `C`: The weight matrix for `Linear_2`.
+-   `D2`: The bias for `Linear_2` plus the original input `X` for the residual connection.
+
+By mapping the components of a Transformer FFN block to this kernel, a developer can achieve performance far beyond what is possible with a sequence of standard library calls. This demonstrates the power of Composable Kernel to create highly domain-specific, performance-leading fused operations.
--- a/example/38_grouped_conv_bwd_data_multiple_d/README.md
+++ b/example/38_grouped_conv_bwd_data_multiple_d/README.md
@@ -0,0 +1,75 @@
+# Grouped Convolution Backward Data with Multiple Elementwise Inputs
+
+This example demonstrates a **Grouped Convolution Backward Data Pass** fused with an elementwise operation that takes multiple auxiliary input tensors (`D` tensors). The backward data pass (also known as a transposed convolution or deconvolution) computes the gradient of the loss with respect to the convolution's *input* tensor. Fusing it with other operations is a powerful way to optimize the backward pass of a neural network.
+
+## Mathematical Formulation
+
+The operation computes the gradient with respect to the input (`GradIn`) of a grouped convolution, and then fuses the result with other tensors. For each group `g` from `0` to `G-1`:
+
+1.  **Backward Data Convolution Stage**: A standard N-dimensional backward data convolution is performed for the group. This computes the gradient that should be propagated back to the input of the original forward-pass convolution.
+    $GradIn_{temp[g]} = \text{ConvBwdData}(\text{GradOut}_{[g]}, \text{W}_{[g]})$
+    Where `GradOut` is the gradient from the subsequent layer and `W` is the weight tensor from the forward pass.
+
+2.  **Elementwise Stage**: The result of the backward convolution is combined with one or more auxiliary tensors ($D_{0[g]}, D_{1[g]}, \dots$) using a user-defined elementwise function `f`.
+    $GradIn_{[g]} = f(GradIn_{temp[g]}, D_{0[g]}, D_{1[g]}, \dots)$
+
+This fusion is particularly useful for operations like adding the gradient from a residual "skip" connection, which is a common pattern in modern network architectures. By fusing the addition, we avoid a separate kernel launch and a full read/write pass of the `GradIn` tensor.
+
+## Algorithmic Strategy: Implicit Grouped GEMM with Fused Multi-D Epilogue
+
+The implementation uses the implicit GEMM algorithm, but configured for the backward data pass.
+
+1.  **Group Scheduling**: The `G` independent problems are distributed across the GPU's thread blocks. Each thread block is assigned to compute the fused backward convolution for one of the `G` groups.
+
+2.  **Implicit GEMM for Backward Data**: The backward data convolution can be mathematically re-arranged to be equivalent to a forward convolution with transformed inputs and weights, which can then be solved with an implicit GEMM algorithm. Composable Kernel handles this transformation. A thread block executes the implicit GEMM for its assigned group, accumulating the `GradIn_temp` result in registers.
+
+3.  **Fused Multi-D Epilogue**: Before writing the result to global memory, the epilogue performs the elementwise fusion:
+    -   Threads load the corresponding tiles from the auxiliary `D` tensors for the assigned group.
+    -   The user-defined elementwise function `f` is applied in registers to the computed gradient and the `D` tensor values.
+    -   The final result `GradIn` for the group is written to global memory.
+
+This strategy minimizes memory bandwidth by avoiding the materialization of the intermediate gradient tensor and maximizes parallelism by executing all groups concurrently.
+
+## Source Code Organization
+
+-   [`grouped_conv_bwd_data_multiple_d_xdl.cpp`](./grouped_conv_bwd_data_multiple_d_xdl.cpp): The main example file. It sets up the grouped backward convolution problem, including the multiple `D` tensors, and instantiates the `DeviceGroupedConvBwdDataMultipleD` operation.
+-   [`../../include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp`](../../include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp): The high-level device interface for this operation. It takes arrays of tensor descriptors, one for each group for each of the `D` tensors.
+-   The underlying grid-wise kernel contains the logic to map thread blocks to groups and then execute the full implicit GEMM pipeline (formulated for backward data) with the fused multi-D epilogue for the assigned group.
+
+## Build and Run
+
+### Prerequisites
+Ensure the Composable Kernel library is built and installed.
+```bash
+cd /path/to/composable_kernel/build
+make -j install
+```
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/38_grouped_conv_bwd_data_multiple_d
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+```bash
+# Run the example with default settings
+./grouped_conv_bwd_data_multiple_d_xdl
+
+# Run with verification, data initialization, and timing
+./grouped_conv_bwd_data_multiple_d_xdl 1 2 1
+```
+
+## Applications in Backpropagation
+
+Fusing operations into the backward pass is a critical optimization for training deep neural networks.
+
+-   **Fused Residual Gradient**: In a residual block (`y = F(x) + x`), the gradient of the loss `L` with respect to `x` is `dL/dx = (dL/dy)·(dF/dx) + dL/dy`. If `F` is a convolution, the first term is the output of the `ConvBwdData` operation applied to `dL/dy`. The second term (the gradient flowing through the skip connection, which is simply `dL/dy`) can be passed as a `D` tensor and fused via an addition, computing the full gradient for `x` in a single kernel.
+-   **Fused Gradient Clipping/Scaling**: The `D` tensors and the elementwise function `f` could be used to apply gradient scaling or other custom gradient processing steps directly to the output of the backward convolution, before the result is written back to memory.
--- a/example/39_permute/README.md
+++ b/example/39_permute/README.md
@@ -0,0 +1,56 @@
+# Tensor Permutation (Dimension Reordering)
+
+## Theory
+
+This example demonstrates **tensor permutation operations**, which reorder the dimensions of tensors according to a specified permutation pattern. Permutation is fundamental for many machine learning operations, including tensor layout transformations, data format conversions, and implementing complex tensor operations.
+
+**Mathematical Formulation:**
+Given an input tensor $X$ with shape $[D_0, D_1, ..., D_{n-1}]$ and a permutation pattern $P = [p_0, p_1, ..., p_{n-1}]$, the permutation operation produces an output tensor $Y$ with shape $[D_{p_0}, D_{p_1}, ..., D_{p_{n-1}}]$ such that:
+$$
+Y_{i_{p_0}, i_{p_1}, ..., i_{p_{n-1}}} = X_{i_0, i_1, ..., i_{n-1}}
+$$
+
+**Algorithmic Background:**
+- Permutation is used for matrix transpose, NCHW/NHWC layout conversion, attention head reshaping, and more.
+- Efficient permutation requires optimizing memory access patterns for coalescing and bandwidth.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/39_permute
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (matrix transpose)
+./permute_xdl --input_shape=4096,4096 --permutation=1,0 --verify=1 --time=1
+
+# Example run (NCHW to NHWC)
+./permute_xdl --input_shape=32,256,56,56 --permutation=0,2,3,1 --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/39_permute/
+└── permute_xdl.cpp              # Main example: sets up, runs, and verifies tensor permutation
+include/ck/tensor_operation/gpu/device/
+└── device_permute.hpp           # Device-level permutation API
+include/ck/tensor_operation/gpu/grid/
+└── gridwise_permute.hpp         # Grid-level permutation kernel
+```
+
+### Key Classes and Functions
+
+- **DevicePermute** (in `device_permute.hpp`):  
+  Device API for tensor permutation.
+- **gridwise_permute** (in `gridwise_permute.hpp`):  
+  Implements the tiled/blocking permutation kernel.
+
+This example demonstrates how Composable Kernel implements efficient tensor dimension reordering for layout transformations and deep learning operations.
--- a/example/40_conv2d_fwd_quantization/README.md
+++ b/example/40_conv2d_fwd_quantization/README.md
@@ -0,0 +1,61 @@
+# 2D Convolution Forward with Quantization
+
+## Theory
+
+This example demonstrates **2D convolution forward with quantized weights or activations**. Quantization is used to reduce memory and computation by representing values with lower-precision integer types (e.g., int8), enabling efficient inference in deep learning.
+
+**Mathematical Formulation:**
+- Quantized convolution: $Y = \text{dequant}(X_q) * \text{dequant}(W_q)$
+- $X_q$, $W_q$: quantized input and weight tensors (e.g., int8)
+- $\text{dequant}(x_q) = (x_q - z) \cdot s$ (scale $s$, zero-point $z$)
+- $Y$: output tensor (often in higher precision, e.g., float32 or float16)
+
+**Algorithmic Background:**
+- Quantized values are dequantized on-the-fly during convolution.
+- Accumulation is performed in higher precision for accuracy.
+- Supports symmetric and asymmetric quantization.
+- Convolution is implemented as implicit GEMM for efficiency.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/40_conv2d_fwd_quantization
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./conv2d_fwd_quantization_xdl --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/40_conv2d_fwd_quantization/
+└── conv2d_fwd_quantization_xdl.cpp          # Main example: sets up, runs, and verifies quantized conv2d
+include/ck/tensor_operation/gpu/device/
+└── device_conv2d_fwd_quantization.hpp       # Device-level quantized conv2d API
+include/ck/tensor_operation/gpu/device/impl/
+└── device_conv2d_fwd_quantization_impl.hpp  # Implementation
+include/ck/tensor_operation/gpu/grid/
+└── gridwise_conv2d_fwd_quantization.hpp     # Grid-level quantized conv2d kernel
+include/ck/tensor_operation/gpu/element/
+└── quantization_operations.hpp              # Quantization/dequantization utilities
+```
+
+### Key Classes and Functions
+
+- **DeviceConv2dFwdQuantization** (in `device_conv2d_fwd_quantization.hpp`):  
+  Device API for quantized 2D convolution.
+- **gridwise_conv2d_fwd_quantization** (in `gridwise_conv2d_fwd_quantization.hpp`):  
+  Implements the tiled/blocking quantized conv2d kernel.
+- **quantization_operations** (in `quantization_operations.hpp`):  
+  Defines quantization and dequantization functions.
+
+This example demonstrates how Composable Kernel supports efficient quantized convolution for deep learning inference.
--- a/Show More
+++ b/Show More