Merge branch 'develop' into hstu_attention_mi350_fwd_bwd

2026-05-21 05:19:20 +00:00 · 2025-11-23 04:20:53 +00:00
parent b75077475b 02ab76c2cb
commit 4f33eb5857
3118 changed files with 208508 additions and 42460 deletions
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -44,8 +44,7 @@ list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllv
 example_compile_options(example_gemm_xdl_fp8_v3 PRIVATE ${GEMM_OPTIONS})
 example_compile_options(example_gemm_xdl_bf16_v3 PRIVATE ${GEMM_OPTIONS})

-
-list(APPEND gpu_list gfx942 gfx950)
+list(APPEND gpu_list gfx942 gfx950 gfx1200 gfx1201 gfx12-generic)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list AND target EQUAL 0)
@@ -89,7 +88,14 @@ foreach(gpu IN LISTS GPU_TARGETS)

        add_example_executable(example_gemm_xdl_lds_direct_load_fp16 gemm_xdl_lds_direct_load_fp16.cpp)
        add_example_dependencies(example_gemm_xdl example_gemm_xdl_lds_direct_load_fp16)
+        set(target 1)
+    endif()
+endforeach()

+list(APPEND gpu_list gfx90a gfx942 gfx950 gfx1200 gfx1201 gfx12-generic)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list AND target EQUAL 0)
        add_example_executable(example_gemm_xdl_bf16_streamk_v3 gemm_xdl_bf16_streamk_v3.cpp)
        add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_streamk_v3)

@@ -99,6 +105,16 @@ foreach(gpu IN LISTS GPU_TARGETS)
    endif()
 endforeach()

+list(APPEND gpu_list_tf32 gfx942 gfx950)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0)
+        add_example_executable(example_gemm_xdl_lds_direct_load_fp32_tf32 gemm_xdl_lds_direct_load_fp32_tf32.cpp)
+        add_example_dependencies(example_gemm_xdl example_gemm_xdl_lds_direct_load_fp32_tf32)
+        set(target 1)
+    endif()
+endforeach()
+
 add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)

--- a/example/01_gemm/README.md
+++ b/example/01_gemm/README.md
@@ -1,27 +1,221 @@
-# Instructions for ```example_gemm_xdl```
+[Back to supported operations](../../../include/ck/README.md)
+# Composable Kernel GEMM Example
+
+## Introduction
+
+GEMM (General Matrix Multiplication) is a fundamental operation in linear algebra and deep learning. It computes the product of two matrices, optionally adds a bias or residual, and is the core of many neural network layers (MLPs, attention, convolutions via im2col). This example demonstrates the flexible and high-performance GEMM API provided by Composable Kernel.
+
+---
+
+## Theory
+
+**Mathematical Formulation:**
+$$
+C = \alpha (A \times B) + \beta D
+$$
+- $A$: [M, K] input matrix
+- $B$: [K, N] weight matrix
+- $D$: [M, N] optional bias/residual
+- $C$: [M, N] output
+- $\alpha, \beta$: scalars (often 1.0, 0.0)
+
+GEMM is implemented using a tiled/blocking strategy to maximize data reuse and memory bandwidth. Modern GPU implementations use matrix core/XDL/MFMA instructions for high throughput. The operation is the computational backbone for transformer attention, MLPs, CNNs (via lowering), and more.
+
+---
+
+## CK GEMM API Overview
+
+CK provides a highly composable GEMM API via the `DeviceGemm` family of device operations. These are highly templated to support a wide range of data types, layouts, and fused operations.
+
+### Template Parameters
+
+- **ALayout** - A matrix layout (RowMajor/ColumnMajor)
+- **BLayout** - B matrix layout (RowMajor/ColumnMajor)
+- **CLayout** - C matrix layout (RowMajor/ColumnMajor)
+- **ADataType** - A matrix data type
+- **BDataType** - B matrix data type
+- **CDataType** - C matrix data type
+- **AElementwiseOperation** - Fused operation on tensor A before GEMM
+- **BElementwiseOperation** - Fused operation on tensor B before GEMM
+- **CElementwiseOperation** - Fused operation on tensor C after GEMM
+
+For large K dimension, use `DeviceGemmSplitK` to split K across workgroups (requires zeroing output buffer due to use of AtomicAdd).
+
+For fused operations with additional tensors, use `DeviceGemmMultipleABD` or `DeviceGemmMultipleD`:
+- **DsLayout** - layouts for additional tensors
+- **DsDataType** - data types for additional tensors
+
+For `DeviceGemmMultipleABD`, pass **ALayout**, **BLayout**, **ADataType**, **BDataType** as tuples.
+
+---
+
+## Supported GEMM Variants
+
+- **DeviceGemm**: Standard GEMM
+- **DeviceGemmSplitK**: Split-K GEMM for large K
+- **DeviceGemmMultipleABD**: Fused GEMM with multiple A/B/D tensors
+- **DeviceGemmMultipleD**: Fused GEMM with multiple D tensors
+
+---
+
+## Supported Device Operations
+
+- **DeviceGemmDl**: DL instructions
+- **DeviceGemmDpp**: DL instructions with DPP during data load
+- **DeviceGemmWmma_CShuffle**: WMMA instructions with CShuffle optimization
+- **DeviceGemm_Xdl_CShuffle_LdsDirectLoad**: XDL instructions, CShuffle, direct global-to-shared load
+- **DeviceGemm_Xdl_CShuffle**: XDL instructions with CShuffle
+- **DeviceGemm_Xdl_CShuffleV2**: XDL instructions, optimized pipeline vs. V1
+- **DeviceGemmXdlSkipBLds**: XDL, skips shared memory load for B
+- **DeviceGemm_Xdl_WaveletModel_CShuffle**: XDL, CShuffle, wavelet producer/consumer
+- **DeviceGemmXdl**: XDL instructions
+
+---
+
+## Supported Data Types and Layouts
+
+### XDL Instruction
+
+|       |Is supported|
+|-------|---|
+|bf16   |✔️|
+|fp16   |✔️|
+|fp32   |✔️|
+|int8   |✔️|
+|fp8    |✔️|
+
+### WMMA Instruction
+
+|       |Is supported|
+|-------|---|
+|bf16   |✔️|
+|fp16   |✔️|
+|fp32   |❌|
+|int8   |✔️|
+|fp8    |❌|
+
+### DL Instruction
+
+|       |Is supported|
+|-------|---|
+|bf16   |❌|
+|fp16   |✔️|
+|fp32   |✔️|
+|int8   |✔️|
+|fp8    |❌|
+
+---
+
+## Supported Fused Elementwise Operations
+
+- **B Matrix Multiply + Add + Gelu** - bf16 (int8 for B matrix)
+- **B Matrix Multiply + Add** - bf16 (int8 for B matrix)
+- **B Matrix Multiply + Gelu** - bf16 (int8 for B matrix)
+- **B Matrix Multiply** - bf16 (int8 for B matrix)
+- **Add + Add + Gelu** - fp16
+- **Add + Gelu** - fp16, bf16 (int8 for B matrix) for Row/Column/Row
+- **Multiply** - fp16
+- **Add + Multiply** - fp16
+- **Add + Relu** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
+- **Add + Silu** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
+- **Add** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
+- **Bilinear** - fp16, int8
+- **Gelu** - fp16
+- **Multiply + Add** - fp16 for Row/Column/Row and Row/Row/Row, fp16 (int8 for B matrix, fp32 for Bias) for Row/Column/Row and Row/Row/Row
+- **Quantization** - int8
+
+---
+
+## GEMM V2 (Universal GEMM)
+
+Optimized for MI300 series. Operation is called as `DeviceGemmV2` and uses similar template parameters as above.
+
+- **ALayout**, **BLayout**, **CLayout**
+- **ADataType**, **BDataType**, **CDataType**
+- **AElementwiseOperation**, **BElementwiseOperation**, **CElementwiseOperation**
+
+Split-K is supported (requires zeroing output buffer if splitK > 1).
+
+### Device Operations
+
+- **DeviceGemm_Xdl_CShuffleV3**: XDL with CShuffle optimization
+- **DeviceGemm_Xdl_CShuffleV3R1**: XDL with CShuffle, reduction on split-K after GEMM
+
+### Supported Types
+
+|       |Is supported|
+|-------|---|
+|bf16   |✔️|
+|fp16   |✔️|
+|fp32   |❌|
+|int8   |❌|
+|fp8 (C bf16)|✔️|
+|fp16 (A fp8)|✔️|
+|fp16 (B fp8)|✔️|
+
+---
+
+## Other GEMM Extensions
+
+- **DeviceGemm_dequantB**: GEMM with dequantization (WMMA)
+- **DeviceGemmMultipleD_ABScale**: GEMM with scale for A and B
+- **DeviceGemmMultipleDLayernorm**: GEMM fused with layernorm
+- **DeviceGemmMultipleDMultipleR**: GEMM fused with reductions and custom global reductions
+- **DeviceGemmReduce**: GEMM fused with reduction
+- **DeviceGemm_Streamk_V2**: Stream K with reduction instead of AtomicAdd
+- **DeviceGemmStreamK**: Stream K using AtomicAdd
+
+---
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run

-## Run ```example_gemm_xdl```
 ```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: run kernel # of times (>1)
-./bin/example_gemm_xdl 0 1 5
+cd composable_kernel/example/01_gemm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run (FP16)
+./gemm_xdl_fp16 -M 4096 -N 4096 -K 4096 -v 1 -t 1
 ```

-# Instructions for ```example_gemm_xdl_fp16_streamk_v3```
+---
+
+## Source Code Structure

-## Run ```example_gemm_xdl_fp16_streamk_v3```
-```bash
-arg1: verification (0=no, 1=yes)
-arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-arg3: time kernel (0=no, 1=yes)
-arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC
-arg10: stream-k select (-1: default config, 0: all DP, 1: 1-tile SK, 2: 2-tile SK)
-arg11: Grid_size(-1 for max occupancy)
-bin/example_gemm_xdl_fp16_streamk_v3 1 2 1 3840 4096 4096 4096 4096 4096 1 -1
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
-c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-problem {M:3840, N:4096, K:4096, SA:4096, SB:4096, SC:4096, MP:4032, NP:4096, KRead:4096, KP:4096, AK0:512, BK0:2048, MBlock: 18, NBlock: 16, Stream-K Selection:1, Grid size:-1}
-Perf: 0.292022 ms, 441.23 TFlops, 330.348 GB/s, DeviceGemmXdlUniversal<MNPadding, RRR> BlkSize: 256, BlkTile: 224x256x64, WaveTile: 16x16, WaveMap: 7x8, VmemReadVec: 8x8, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2
 ```
+example/01_gemm/
+├── gemm_xdl_fp16.cpp         # Main example: sets up, runs, and verifies GEMM (FP16)
+├── gemm_xdl_fp32.cpp         # Main example: FP32 variant
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm.hpp       # Device-level GEMM API (templated)
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_gemm_xdl.hpp   # XDL-based GEMM implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_gemm_xdl.hpp # Grid-level tiled GEMM kernel
+include/ck/tensor_operation/gpu/block/
+│   └── blockwise_gemm_xdl.hpp # Block-level tiled GEMM
+library/reference_tensor_operation/cpu/
+    └── reference_gemm.hpp    # CPU reference GEMM for correctness checking
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmXdl** (in `device_gemm.hpp`):  
+  Main device API for launching GEMM kernels.  
+- **GridwiseGemmXdl** (in `gridwise_gemm_xdl.hpp`):  
+  Implements the tiled/blocking GEMM kernel for the GPU grid.
+- **BlockwiseGemmXdl** (in `blockwise_gemm_xdl.hpp`):  
+  Handles block-level computation and shared memory tiling.
+- **reference_gemm** (in `reference_gemm.hpp`):  
+  CPU implementation for result verification.
+
+---
+
+This example is the foundation for all matrix operations in Composable Kernel and is the basis for more advanced fused and batched operations.
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -25,6 +25,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;

 struct ProblemSize final
 {
@@ -310,10 +315,14 @@ bool parse_cmd_args<ProblemSizeSplitK>(int argc,
    return true;
 }

-template <typename DataType>
+template <typename DataType, typename ComputeDataType = DataType>
 inline __host__ __device__ constexpr double get_rtol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<ComputeDataType, ck::tf32_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
    {
        return 1e-3;
    }
@@ -351,10 +360,14 @@ inline __host__ __device__ constexpr double get_rtol()
    }
 }

-template <typename DataType>
+template <typename DataType, typename ComputeDataType = DataType>
 inline __host__ __device__ constexpr double get_atol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<ComputeDataType, ck::tf32_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
    {
        return 1e-3;
    }
--- a/example/01_gemm/gemm_wmma_fp16_v3.cpp
+++ b/example/01_gemm/gemm_wmma_fp16_v3.cpp
@@ -26,17 +26,18 @@ using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuf
    ALayout, BLayout, CLayout,
    ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
    PassThrough, PassThrough, PassThrough, GemmDefault,
-    128,
-    128, 64,
-    64, 8, 8,
+    256,
+    128, 256, 64,
+    8, 8,
    16, 16,
-    4, 2,
-    S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
+    2, 8,
+    S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
    1, 1, 8, 1,
-    S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
+    S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
    1, 1, 8, 1,
-    1, 1, S<1, 32, 1, 4>, 8,
-    ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3>;
+    1, 1,
+    S<1, 64, 1, 4>, 8,
+    ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::
--- a/example/01_gemm/gemm_wmma_fp8_v3.cpp
+++ b/example/01_gemm/gemm_wmma_fp8_v3.cpp
@@ -13,7 +13,7 @@ using CDataType        = ck::bhalf_t;
 using ComputeTypeA     = ck::f8_t;
 using ComputeTypeB     = ck::f8_t;

-using ALayout = Row;
+using ALayout = Col;
 using BLayout = Col;
 using CLayout = Row;

@@ -30,13 +30,13 @@ using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuf
    PassThrough, PassThrough, PassThrough, GemmDefault,
    128,
    128, 64, 64,
-    8, 8,
+    16, 16, // AK1, BK1
    16, 16,
    4, 2,
+    S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
+    1, 4, 16, 0,
    S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
-    2, 8, 8, 0,
-    S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
-    2, 8, 8, 0,
+    2, 16, 16, 0,
    1, 1, S<1, 32, 1, 4>, 8,
    ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1,
    ComputeTypeA, ComputeTypeB>;
--- a/example/01_gemm/gemm_xdl_bf16.cpp
+++ b/example/01_gemm/gemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -27,7 +27,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::
--- a/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -199,9 +199,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        return true;
    }

-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx11_supported() || ck::is_gfx12_supported()))
    {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;

        return true;
    }
--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -37,7 +37,7 @@ using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffl
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           2,              S<1, 16, 1, 16>,               8, ck::LoopScheduler::Interwave, ck::PipelineVersion::v1>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           2,              S<1, 16, 1, 16>,               4, ck::LoopScheduler::Interwave, ck::PipelineVersion::v1>;
 // clang-format on

 using DeviceGemmInstance = DeviceGemmInstance1;
--- a/example/01_gemm/gemm_xdl_fp16_fp8.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -30,7 +30,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector| Scheduler|     Version|             |
 // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|          |            |             |
 // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |          |            |             |
-        < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,  LoopSched, PipelineVer, ComputeType>;
+        < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4,  LoopSched, PipelineVer, ComputeType>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
--- a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -249,9 +249,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        return true;
    }

-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx11_supported() || ck::is_gfx12_supported()))
    {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;

        return true;
    }
--- a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -38,14 +38,14 @@ using DeviceGemmV2Instance =
        AElementOp, BElementOp, CElementOp, GemmDefault, 
        256, Scale_Block_N, Scale_Block_K,
        128, 128,
-        KPerBlock, 8, 32,
-        32,   32,
-        4,    1,
+        KPerBlock, 8, 16,
+        16,   16,
+        8,    2,
        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
        2, 8, 8, 0,
        S<2, 128, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
-        2, 32, 32, 0,
-        1, 1, S<1, 32, 1, 8>, 8,
+        2, 16, 16, 0,
+        1, 1, S<1, 16, 1, 16>, 4,
        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, CDataType, CDataType, PermuteA, PermuteB>;

 // clang-format on
@@ -281,9 +281,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        return true;
    }

-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx11_supported() || ck::is_gfx12_supported()))
    {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;

        return true;
    }
--- a/example/01_gemm/gemm_xdl_fp16_v2.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_v2.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -33,13 +33,13 @@ using DeviceGemmInstance =
        2,   256,
        256, 256, 
        32, 8, 4,
-        32,   32,
-        4,    4, 
+        16,   16,
+        8,    8, 
        S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
        2, 8, 8, 0,
        S<8, 32, 1>,  S<0, 2, 1>,  S<0, 2, 1>,
        1, 8, 4, 0,
-        1, 1, S<1, 32, 1, 8>, 8,
+        1, 1, S<1, 32, 1, 8>, 4,
        ck::LoopScheduler::Default, ck::PipelineVersion::v1>;
 // clang-format on

--- a/example/01_gemm/gemm_xdl_fp8.cpp
+++ b/example/01_gemm/gemm_xdl_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -31,7 +31,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|  Scheduler|     Version|        TypeA|        TypeB|
 // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|           |            |             |             |
 // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |           |            |             |             |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,               8,  LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4,  LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
    // this instance has been tested working on gfx950     
    //     < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    128,  32,  32,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,               8,  LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
 // clang-format on
@@ -55,4 +55,12 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa

 #include "run_gemm_example.inc"

-int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+
+    return !run_gemm_example(argc, argv);
+}
--- a/example/01_gemm/gemm_xdl_fp8_bf8.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_bf8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -31,7 +31,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|  Scheduler|     Version|        TypeA|        TypeB|
 // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|           |            |             |             |
 // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |           |            |             |             |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,               8,  LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4,  LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
@@ -57,4 +57,12 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa

 #include "run_gemm_example.inc"

-int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+
+    return !run_gemm_example(argc, argv);
+}
--- a/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp
@@ -28,7 +28,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa

 static constexpr bool PermuteA = false;
 static constexpr bool PermuteB = false;
-
+static constexpr int KPack     = 32; // int4 -> 32, fp8 -> 16, fp16 -> 8
 // clang-format off
 #if 0
 using DeviceGemmV2Instance = 
@@ -56,14 +56,14 @@ using DeviceGemmV2Instance =
        AElementOp, BElementOp, CElementOp, GemmDefault, 
        256,
        256, 256,
-        128, 16, 32,
-        32,   32,
-        4,    4,
+        128, 16, KPack,
+        16,   16,
+        8,    8,
        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
        2, 16, 16, 0,
        S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
        2, 32, 32, 0,
-        1, 1, S<1, 32, 1, 8>, 8,
+        1, 1, S<1, 32, 1, 8>, 4,
        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, F8, F8, PermuteA, PermuteB>;

 #endif
@@ -160,7 +160,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    auto gemm = DeviceGemmV2Instance{};

    // weight pre-shuffle
-    int KPack = 32; // int4 -> 32, fp8 -> 16, fp16 -> 8
    int NLane = gemm.GetPreShuffleParameters();
    int KLane = 64 / NLane;

@@ -269,9 +268,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        return true;
    }

-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx12_supported()))
    {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950 and gfx12 only" << std::endl;

        return true;
    }
--- a/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -38,14 +38,14 @@ using DeviceGemmV2Instance =
        AElementOp, BElementOp, CElementOp, GemmDefault, 
        256,
        128, 128,
-        KPerBlock, 16, 32,
-        32,   32,
-        2,    2,
+        KPerBlock, 16, 16,
+        16,   16,
+        4,    4,
        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
        2, 16, 16, 0,
        S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>,
-        2, 32, 32, 0,
-        1, 1, S<1, 32, 1, 8>, 8,
+        2, 16, 16, 0,
+        1, 1, S<1, 32, 1, 8>, 4,
        ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v2, ADataType, ADataType, PermuteA, PermuteB>;

 // clang-format on
@@ -247,9 +247,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        return true;
    }

-    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
+    if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
+         ck::is_gfx12_supported()))
    {
-        std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
+        std::cout << "This kernel support gfx942, gfx950 and gfx12 only" << std::endl;

        return true;
    }
--- a/example/01_gemm/gemm_xdl_fp8_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_v3.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -36,7 +36,7 @@ using DeviceGemmV2Instance =
        2, 16, 16, 0,
        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
        2, 16, 16, 0,
-        1, 2, S<1, 32, 1, 8>, 8,
+        1, 2, S<1, 32, 1, 8>, 4,
        ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3, ck::f8_t>;
 // clang-format on

--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -27,7 +27,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,              16>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::
--- a/example/01_gemm/gemm_xdl_lds_direct_load_fp32_tf32.cpp
+++ b/example/01_gemm/gemm_xdl_lds_direct_load_fp32_tf32.cpp
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "common.hpp"
+
+#define USING_DIRECT_LOADS 1
+#if USING_DIRECT_LOADS
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_lds_direct_load.hpp"
+#else
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
+#endif
+
+#define EXAMPLE_WITH_COMPUTE_DATATYPE
+
+using F32 = float;
+
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = F32;
+using ComputeDataType  = ck::tf32_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+#if USING_DIRECT_LOADS
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_LdsDirectLoad
+// ######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     AccData|         CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer|
+// ######| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockLds|
+// ######| CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| LoopScheduler      |  pipeline ver           | gemm type  |
+// ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|
+// ######| XDL|  XDL|  Per|  Per|   ThreadCluster| SrcAccessOrder|   SrcVectorDim|         Scalar| AddExtraM|   ThreadCluster| SrcAccessOrder|  SrcVectorDim|         Scalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+// ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|               |               |      PerVector|          | Lengths_K0_N_K1|               |              |      PerVector|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+// ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |          |                |               |              |               |          |            |            |                             |                |
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    32,
+           8,   8,   32,   32,    2,    2,      S<4, 8, 8>,     S<1, 0, 2>,              2,              1,         1,      S<4, 8, 8>,     S<1, 0, 2>,             2,              1,         1,
+           1,           1,               S<1, 8, 1, 8>,               4,   ck::LoopScheduler::Default, ck::PipelineVersion::v4, ComputeDataType>;
+// clang-format on
+#else
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
+// ######| ALayout| BLayout| CLayout|     AData|     BData|     CData|     AccData|         CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+// ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+// ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+// ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 8, 1, 8>,               4>;
+// clang-format on
+#endif
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        AccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        CElementOp,
+                                                                        ComputeDataType,
+                                                                        ComputeDataType>;
+
+using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AccDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
+#include "run_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
+
+#undef EXAMPLE_WITH_COMPUTE_DATATYPE
--- a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
@@ -36,7 +36,7 @@ using BDataType   = ck::half_t;
 using CDataType   = ck::half_t;
 using AccDataType = float;
 #else  
-                    <   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    16,   64,     4,  4,   16,   16,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,             4,      4,        7,               1>;
+                    <   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    16,   128,     4,  4,   16,   16,    1,    2,     S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,             4,      4,        7,               1>;
 using ADataType   = float;
 using BDataType   = float;
 using CDataType   = float;
@@ -185,7 +185,6 @@ int main(int argc, char* argv[])
    auto a_element_op = AElementOp{};
    auto b_element_op = BElementOp{};
    auto c_element_op = CElementOp{};
-
    // do GEMM
    auto gemm     = DeviceGemmInstance{};
    auto invoker  = gemm.MakeInvoker();
@@ -209,8 +208,7 @@ int main(int argc, char* argv[])
        return 0;
    }

-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-
+    float ave_time   = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =
        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
--- a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -29,7 +29,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_WaveletM
 // ######|        |        |        |      Type|      Type|        Type|         DataType|      Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| ThreadGroupSize| ThreadGroupSize| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|        |        |        |          |          |            |                 |          |   Operation|   Operation|   Operation|               |    Stage|                |                |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|        |        |        |          |          |            |                 |          |            |            |            |               |         |                |                |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType,              F16, CDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,             256,             256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,                S<1, 32, 1,8>,               8>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType,              F16, CDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,             256,             256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,                S<1, 32, 1,8>,               4>;
 // clang-format on

 using DeviceGemmInstance = DeviceGemmInstance;
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -2,7 +2,11 @@
 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
-#include "ck/library/utility/validation_common.hpp"
+
+// use macro to minimize code change
+#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
+using ComputeDataType = AccDataType;
+#endif

 template <typename ProblemType>
 bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
@@ -24,11 +28,11 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
            }
            else
            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
            }
        };

@@ -54,17 +58,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
    StrideC = f_get_default_stride(M, N, StrideC, CLayout{});

-    try
-    {
-        ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
-            M, N, K, StrideA, StrideB, StrideC);
-    }
-    catch(const std::runtime_error& e)
-    {
-        std::cerr << "Error: " << e.what() << std::endl;
-        return false;
-    }
-
    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));

@@ -218,8 +211,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        pass &= ck::utils::check_err(c_m_n_device_result,
                                     c_m_n_host_result,
                                     "Error: Incorrect results!",
-                                     get_rtol<CDataType>(),
-                                     get_atol<CDataType>());
+                                     get_rtol<CDataType, ComputeDataType>(),
+                                     get_atol<CDataType, ComputeDataType>());
 #endif
    }

@@ -249,8 +242,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        pass &= ck::utils::check_err(c_m_n_device_result,
                                     c_m_n_device_ref_result,
                                     "Error: Incorrect results!",
-                                     get_rtol<CDataType>(),
-                                     get_atol<CDataType>());
+                                     get_rtol<CDataType, ComputeDataType>(),
+                                     get_atol<CDataType, ComputeDataType>());
    }

    return pass == true;
--- a/example/02_gemm_bilinear/README.md
+++ b/example/02_gemm_bilinear/README.md
@@ -1,6 +1,78 @@
-# Instructions for ```example_gemm_bilinear_xdl_fp16```
+# Composable Kernel GEMM Bilinear Example
+
+## Introduction
+
+This example demonstrates GEMM (General Matrix Multiplication) fused with bilinear operations on auxiliary tensors using Composable Kernel. Bilinear fusion patterns are widely used in neural networks for gating, attention, and multimodal feature fusion, where the output of a matrix multiplication is combined elementwise with one or more additional tensors.
+
+---
+
+## Theory
+
+**Mathematical Formulation:**
+$$
+F = \text{BilinearOp}(A \times B, D, E)
+$$
+- $A$: [M, K] input matrix
+- $B$: [K, N] weight matrix
+- $D$, $E$: [M, N] auxiliary tensors (or broadcastable)
+- $F$: [M, N] output
+
+**Examples:**
+- Elementwise: $F = (A \times B) \odot D \odot E$
+- Gated: $F = (A \times B) \odot \sigma(D) + E$
+- Weighted: $F = \alpha (A \times B) + \beta (D \odot E)$
+
+The GEMM result is kept in registers and combined with auxiliary tensors in the epilogue, avoiding intermediate writes to global memory. This pattern is common in attention, gating, and feature interaction layers.
+
+---
+
+## CK GEMM Bilinear API Overview
+
+CK provides a composable API for GEMM with multiple auxiliary tensors via the `DeviceGemmMultipleD` operation.
+
+### Template Parameters
+
+- **ALayout** - A matrix layout (RowMajor/ColumnMajor)
+- **BLayout** - B matrix layout (RowMajor/ColumnMajor)
+- **DsLayout** - Layouts for auxiliary tensors (tuple)
+- **ELayout** - Output matrix layout (RowMajor/ColumnMajor)
+- **ADataType** - A matrix data type
+- **BDataType** - B matrix data type
+- **DsDataType** - Data types for auxiliary tensors (tuple)
+- **EDataType** - Output matrix data type
+- **AElementwiseOperation** - Fused operation on tensor A before GEMM
+- **BElementwiseOperation** - Fused operation on tensor B before GEMM
+- **CDEElementwiseOperation** - Fused operation on C, D, E after GEMM
+
+### Supported Data Types and Layouts
+
+- Supports fp16, int8, and other types depending on the device operation.
+- Supports RowMajor and ColumnMajor layouts for all tensors.
+
+### Supported Device Operations
+
+- **DeviceGemmMultipleD**: Standard multi-tensor GEMM
+- **DeviceGemmMultipleD_Bilinear**: GEMM with bilinear fusion in the epilogue
+
+---
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+
+```bash
+cd composable_kernel/example/02_gemm_bilinear
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+```
+### Run ```example_gemm_bilinear_xdl_fp16```

-## Run ```example_gemm_bilinear_xdl_fp16```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
@@ -9,3 +81,35 @@
 #arg11 to 12: alpha, beta
 ./bin/example_gemm_bilinear_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096 4096 0.5 0.5
 ```
+
+---
+
+## Source Code Structure
+
+```
+example/02_gemm_bilinear/
+├── gemm_bilinear_xdl.cpp         # Main example: sets up, runs, and verifies GEMM with bilinear fusion
+├── gemm_bilinear_wmma_fp16.cpp   # WMMA FP16 variant
+├── gemm_bilinear_wmma_int8.cpp   # WMMA int8 variant
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm_multiple_d.hpp       # Device-level API for multi-tensor GEMM
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_gemm_bilinear_impl.hpp    # Bilinear operation implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_gemm_multiple_d.hpp     # Grid-level multi-tensor GEMM kernel
+include/ck/tensor_operation/gpu/element/
+    └── element_wise_operation.hpp       # Elementwise operation definitions
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmMultipleD** (in `device_gemm_multiple_d.hpp`):  
+  Device API for GEMM with multiple auxiliary tensors and fused epilogues.
+- **gridwise_gemm_multiple_d** (in `gridwise_gemm_multiple_d.hpp`):  
+  Implements the tiled/blocking GEMM kernel with multi-tensor epilogue.
+- **element_wise_operation** (in `element_wise_operation.hpp`):  
+  Defines bilinear and other elementwise operations.
+
+---
+
+This example demonstrates how Composable Kernel supports complex multi-tensor fusion patterns for advanced neural network architectures.
--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/utility/check_err.hpp"
 #include "ck/host_utility/device_prop.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 struct AlphaBetaAdd
 {
    AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
@@ -43,8 +47,9 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;

-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

@@ -190,11 +195,11 @@ int main(int argc, char* argv[])

            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };

--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/utility/check_err.hpp"
 #include "ck/host_utility/device_prop.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 struct AlphaBetaAdd
 {
    AlphaBetaAdd(int alpha, int beta) : alpha_(alpha), beta_(beta){};
@@ -43,8 +47,9 @@ using S = ck::Sequence<Is...>;
 using I8  = std::int8_t;
 using I32 = std::int32_t;

-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

@@ -190,11 +195,11 @@ int main(int argc, char* argv[])

            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };

--- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
@@ -18,6 +18,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 struct AlphaBetaAdd
 {
    AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
@@ -42,8 +46,9 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;

-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

@@ -87,10 +92,10 @@ using DeviceOpInstance =
                                                                   32,
                                                                   8,
                                                                   8,
-                                                                   32,
-                                                                   32,
+                                                                   16,
+                                                                   16,
+                                                                   8,
                                                                   4,
-                                                                   2,
                                                                   S<4, 64, 1>,
                                                                   S<1, 0, 2>,
                                                                   S<1, 0, 2>,
@@ -108,7 +113,7 @@ using DeviceOpInstance =
                                                                   1,
                                                                   1,
                                                                   S<1, 32, 1, 8>,
-                                                                   8>;
+                                                                   4>;

 int main(int argc, char* argv[])
 {
@@ -173,7 +178,7 @@ int main(int argc, char* argv[])
        printf("arg3: time kernel (0=no, 1=yes)\n");
        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
               "beta\n");
-        exit(0);
+        exit(1);
    }

    auto f_host_tensor_descriptor =
@@ -182,11 +187,11 @@ int main(int argc, char* argv[])

            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };

--- a/example/03_gemm_bias_relu/README.md
+++ b/example/03_gemm_bias_relu/README.md
@@ -1,10 +1,63 @@
-# Instructions for ```example_gemm_bias_relu_xdl_fp16```
+# GEMM with Bias and ReLU Activation Fusion

-## Run ```example_gemm_bias_relu_xdl_fp16```
+## Theory
+
+This example demonstrates **GEMM fused with bias addition and ReLU activation**. This is the core pattern for fully connected (dense) neural network layers and the feed-forward blocks in transformers.
+
+**Mathematical Formulation:**
+$$
+E = \text{ReLU}(A \times B + \text{bias})
+$$
+- $A$: [M, K] input matrix
+- $B$: [K, N] weight matrix
+- $\text{bias}$: [N] bias vector (broadcasted)
+- $E$: [M, N] output
+
+**Algorithmic Background:**
+- The GEMM result is kept in registers, bias is added, and ReLU is applied before writing to global memory.
+- This fusion eliminates intermediate memory traffic and is a standard optimization in deep learning frameworks.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
 ```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: time kernel (0=no, 1=yes)
-#arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE
-./bin/example_gemm_bias_relu_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096
+cd composable_kernel/example/03_gemm_bias_relu
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./gemm_bias_relu_xdl -M 2048 -N 8192 -K 2048 --verify=1 --time=1
 ```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/03_gemm_bias_relu/
+├── gemm_bias_relu_xdl.cpp         # Main example: sets up, runs, and verifies GEMM+Bias+ReLU
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm_multiple_d.hpp         # Device-level API for multi-tensor GEMM
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_gemm_xdl_cshuffle_v3.hpp    # XDL with C-Shuffle epilogue
+│   └── device_gemm_bias_relu_impl.hpp     # Specialized bias+ReLU implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_gemm_xdl_cshuffle.hpp     # Grid-level GEMM with epilogue
+include/ck/tensor_operation/gpu/element/
+    └── element_wise_operation.hpp         # Elementwise operation definitions
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmMultipleD** (in `device_gemm_multiple_d.hpp`):  
+  Device API for GEMM with auxiliary tensors and fused epilogues.
+- **gridwise_gemm_xdl_cshuffle** (in `gridwise_gemm_xdl_cshuffle.hpp`):  
+  Implements the tiled/blocking GEMM kernel with fused epilogue.
+- **element_wise_operation** (in `element_wise_operation.hpp`):  
+  Defines bias addition and ReLU activation.
+
+This example demonstrates the standard epilogue fusion concept that enables efficient neural network layers in modern deep learning.
--- a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
+++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -19,14 +19,19 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

 using F16 = ck::half_t;
 using F32 = float;

-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

@@ -83,10 +88,10 @@ using DeviceOpInstance =
                                                                   32,
                                                                   8,
                                                                   8,
-                                                                   32,
-                                                                   32,
+                                                                   16,
+                                                                   16,
+                                                                   8,
                                                                   4,
-                                                                   2,
                                                                   S<4, 64, 1>,
                                                                   S<1, 0, 2>,
                                                                   S<1, 0, 2>,
@@ -104,7 +109,7 @@ using DeviceOpInstance =
                                                                   1,
                                                                   1,
                                                                   S<1, 32, 1, 8>,
-                                                                   8>;
+                                                                   4>;

 int main(int argc, char* argv[])
 {
@@ -113,13 +118,13 @@ int main(int argc, char* argv[])
    bool time_kernel     = false;

    // GEMM shape
-    ck::index_t M = 3840;
-    ck::index_t N = 4096;
-    ck::index_t K = 4096;
+    ck::index_t M = 1920;
+    ck::index_t N = 2048;
+    ck::index_t K = 2048;

-    ck::index_t StrideA = 4096;
-    ck::index_t StrideB = 4096;
-    ck::index_t StrideE = 4096;
+    ck::index_t StrideA = 2048;
+    ck::index_t StrideB = 2048;
+    ck::index_t StrideE = 2048;

    if(argc == 1)
    {
@@ -160,17 +165,19 @@ int main(int argc, char* argv[])

            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };

+    ck::index_t StrideD = 0;
+
    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, 0, ELayout{}));
+    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, ELayout{}));
    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));

@@ -221,7 +228,7 @@ int main(int argc, char* argv[])
                               K,
                               StrideA,
                               StrideB,
-                               std::array<ck::index_t, 1>{0},
+                               std::array<ck::index_t, 1>{static_cast<int>(StrideD)},
                               StrideE,
                               a_element_op,
                               b_element_op,
--- a/example/04_gemm_add_add_fastgelu/README.md
+++ b/example/04_gemm_add_add_fastgelu/README.md
@@ -1,10 +1,70 @@
-# Instructions for ```example_gemm_add_add_fastgelu_xdl_fp16```
+# GEMM with Add, Add, and FastGELU Activation

-## Run ```example_gemm_add_add_fastgelu_xdl_fp16```
+## Theory
+
+This example demonstrates a **GEMM operation fused with two addition operations and FastGELU activation**. This pattern is used in transformer feed-forward networks and other neural architectures where a linear transformation is followed by bias addition, residual addition, and a non-linear activation.
+
+**Mathematical Formulation:**
+$$
+E = \text{FastGELU}((A \times B) + D_0 + D_1)
+$$
+- $A$: [M, K] input matrix
+- $B$: [K, N] weight matrix
+- $D_0$: [N] bias vector (broadcasted)
+- $D_1$: [M, N] residual tensor
+- $E$: [M, N] output
+
+FastGELU is an efficient approximation of GELU:
+$$
+\text{FastGELU}(x) = x \cdot \sigma(1.702 \cdot x)
+$$
+where $\sigma$ is the sigmoid function.
+
+**Algorithmic Background:**
+- The GEMM result is kept in registers, bias and residual are added, and FastGELU is applied before writing to global memory.
+- No intermediate results are written to global memory.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
 ```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: time kernel (0=no, 1=yes)
-#arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE"
-./bin/example_gemm_add_add_fastgelu_xdl_fp16 1 1 1
+cd composable_kernel/example/04_gemm_add_add_fastgelu
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./gemm_add_add_fastgelu_xdl -M 2048 -N 8192 -K 2048 --verify=1 --time=1
 ```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/04_gemm_add_add_fastgelu/
+├── gemm_add_add_fastgelu_xdl.cpp         # Main example: sets up, runs, and verifies GEMM+Add+Add+FastGELU
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm_multiple_d.hpp         # Device-level API for multi-tensor GEMM
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_gemm_xdl_cshuffle_v3.hpp    # XDL with C-Shuffle epilogue
+│   └── device_gemm_fastgelu_impl.hpp      # FastGELU-specific implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_gemm_multiple_d_xdl.hpp   # Grid-level multi-stage GEMM
+include/ck/tensor_operation/gpu/element/
+    └── element_wise_operation.hpp         # Elementwise operation definitions
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmMultipleD** (in `device_gemm_multiple_d.hpp`):  
+  Device API for GEMM with multiple auxiliary tensors and fused epilogues.
+- **gridwise_gemm_multiple_d_xdl** (in `gridwise_gemm_multiple_d_xdl.hpp`):  
+  Implements the tiled/blocking GEMM kernel with multi-stage epilogue.
+- **element_wise_operation** (in `element_wise_operation.hpp`):  
+  Defines FastGELU and other elementwise operations.
+
+This example demonstrates how Composable Kernel supports complex multi-stage epilogue fusion for advanced neural network architectures.
--- a/example/04_gemm_add_add_fastgelu/common.hpp
+++ b/example/04_gemm_add_add_fastgelu/common.hpp
@@ -23,6 +23,10 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -32,7 +32,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -32,7 +32,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -31,7 +31,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               2>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -32,7 +32,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,              16>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
--- a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+++ b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
@@ -6,18 +6,21 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
    static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
 #endif
    using namespace ck::literals;
+    using Bypass = ck::tensor_layout::BypassLayoutVerification;

-    auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
+    ProblemSize ps =
+        problem_size; // make mutable copy because default stride values of 0 need to be updated
+    auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = ps;

    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };

@@ -41,6 +44,30 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;

+    // If any user-provided leading stride < 0, replace it with the one determined by the
+    // created tensor descriptor. For RowMajor the leading stride is index 0, for ColMajor index 1.
+    auto fetch_leading_stride = [](const auto& tensor, auto layout_tag) -> int {
+        if constexpr(std::is_same_v<decltype(layout_tag), ck::tensor_layout::gemm::RowMajor>)
+        {
+            return static_cast<int>(tensor.GetStrides()[0]);
+        }
+        else
+        {
+            return static_cast<int>(tensor.GetStrides()[1]);
+        }
+    };
+
+    if(StrideA < 0)
+        StrideA = fetch_leading_stride(a_m_k, ALayout{});
+    if(StrideB < 0)
+        StrideB = fetch_leading_stride(b_k_n, BLayout{});
+    if(StrideD0 < 0)
+        StrideD0 = fetch_leading_stride(d0_m_n, D0Layout{});
+    if(StrideD1 < 0)
+        StrideD1 = fetch_leading_stride(d1_m_n, D1Layout{});
+    if(StrideE < 0)
+        StrideE = fetch_leading_stride(e_m_n_host_result, ELayout{});
+
    switch(config.init_method)
    {
    case 0: break;
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
@@ -19,4 +19,13 @@ foreach(gpu IN LISTS GPU_TARGETS)
        add_example_executable(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
        set(target 1)
    endif()
-endforeach()
+endforeach()
+
+list(APPEND gpu_list_tf32 gfx942 gfx950)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0)
+        add_example_executable(example_convnd_fwd_xdl_fp32_tf32 convnd_fwd_xdl_fp32_tf32.cpp)
+        set(target 1)
+    endif()
+endforeach()
--- a/example/09_convnd_fwd/README.md
+++ b/example/09_convnd_fwd/README.md
@@ -1,6 +1,42 @@
-# Instructions for ```example_convnd_fwd_xdl```
+# N-Dimensional Convolution Forward
+
+## Theory
+
+This example demonstrates the **N-dimensional convolution forward pass** using Composable Kernel. Convolution is a fundamental operation in deep learning, especially in convolutional neural networks (CNNs) for images, audio, and volumetric data.
+
+**Mathematical Formulation:**
+Given:
+- Input tensor: $X[N, C_{in}, D_1, D_2, ..., D_n]$
+- Weight tensor: $W[C_{out}, C_{in}, K_1, K_2, ..., K_n]$
+- Output tensor: $Y[N, C_{out}, O_1, O_2, ..., O_n]$
+
+The convolution computes:
+$$
+Y[n, c_{out}, o_1, ..., o_n] = \sum_{c_{in}} \sum_{k_1} ... \sum_{k_n} X[n, c_{in}, o_1 + k_1, ..., o_n + k_n] \cdot W[c_{out}, c_{in}, k_1, ..., k_n]
+$$
+
+Stride, padding, and dilation parameters control the mapping between input and output indices.
+
+**Algorithmic Background:**
+- Composable Kernel implements convolution as an implicit GEMM (matrix multiplication) for efficiency.
+- The input and weight tensors are transformed into matrices, and the convolution is performed as a GEMM.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/09_convnd_fwd
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+```
+
+### Run ```example_convnd_fwd_xdl```

-## Run ```example_convnd_fwd_xdl```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
@@ -16,3 +52,29 @@
 # <right padding>, (ie RightPy, RightPx for 2D)
 ./bin/example_convnd_fwd_xdl 0 1 100
 ```
+## Source Code Structure
+
+### Directory Layout
+```
+example/09_convnd_fwd/
+├── convnd_fwd_xdl.cpp         # Main example: sets up, runs, and verifies N-D convolution
+include/ck/tensor_operation/gpu/device/
+│   └── device_convnd_fwd.hpp       # Device-level convolution API
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_convnd_fwd_xdl.hpp   # XDL-based convolution implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_convnd_fwd_xdl.hpp # Grid-level convolution kernel
+include/ck/tensor_operation/gpu/block/
+    └── blockwise_convnd_fwd_xdl.hpp # Block-level convolution
+```
+
+### Key Classes and Functions
+
+- **DeviceConvNdFwd** (in `device_convnd_fwd.hpp`):  
+  Device API for N-dimensional convolution.
+- **gridwise_convnd_fwd_xdl** (in `gridwise_convnd_fwd_xdl.hpp`):  
+  Implements the tiled/blocking convolution kernel.
+- **blockwise_convnd_fwd_xdl** (in `blockwise_convnd_fwd_xdl.hpp`):  
+  Handles block-level computation and shared memory tiling.
+
+This example demonstrates how Composable Kernel implements efficient N-dimensional convolution using implicit GEMM, supporting a wide range of deep learning applications.
--- a/example/09_convnd_fwd/convnd_fwd_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_common.hpp
@@ -19,6 +19,10 @@
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 void print_helper_msg()
 {
    std::cout << "arg1: verification (0=no, 1=yes)\n"
@@ -27,10 +31,14 @@ void print_helper_msg()
              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
 }

-template <typename DataType>
+template <typename DataType, typename GemmType = DataType>
 inline __host__ __device__ constexpr double get_rtol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<GemmType, ck::tf32_t>)
+    {
+        return 5e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
    {
        return 1e-3;
    }
@@ -68,10 +76,14 @@ inline __host__ __device__ constexpr double get_rtol()
    }
 }

-template <typename DataType>
+template <typename DataType, typename GemmType = DataType>
 inline __host__ __device__ constexpr double get_atol()
 {
-    if constexpr(std::is_same_v<DataType, float>)
+    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<GemmType, ck::tf32_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, float>)
    {
        return 1e-3;
    }
@@ -116,7 +128,8 @@ template <ck::index_t NDimSpatial,
          typename InElementOp,
          typename WeiElementOp,
          typename OutElementOp,
-          typename DeviceConvNDFwdInstance>
+          typename DeviceConvNDFwdInstance,
+          typename ComputeDataType = OutDataType>
 bool run_grouped_conv_fwd(bool do_verification,
                          int init_method,
                          bool time_kernel,
@@ -228,7 +241,11 @@ bool run_grouped_conv_fwd(bool do_verification,
                                                                     OutDataType,
                                                                     InElementOp,
                                                                     WeiElementOp,
-                                                                     OutElementOp>();
+                                                                     OutElementOp,
+                                                                     0,
+                                                                     0,
+                                                                     0,
+                                                                     ComputeDataType>();

        auto ref_invoker  = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(in,
@@ -249,8 +266,8 @@ bool run_grouped_conv_fwd(bool do_verification,
        return ck::utils::check_err(out_device,
                                    out_host,
                                    "Error: incorrect results!",
-                                    get_rtol<OutDataType>(),
-                                    get_atol<OutDataType>());
+                                    get_rtol<OutDataType, ComputeDataType>(),
+                                    get_atol<OutDataType, ComputeDataType>());
    }

    return true;
--- a/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
@@ -19,6 +19,10 @@
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 void print_helper_msg()
 {
    std::cout << "arg1: verification (0=no, 1=yes)\n"
--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -51,10 +51,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -72,7 +72,7 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8>;
+        4>;

 #include "run_convnd_fwd_example.inc"

--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -52,10 +52,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -73,9 +73,17 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8,
+        4,
        ComputeType>;

 #include "run_convnd_fwd_example.inc"

-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf8_fp8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf8_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -53,10 +53,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -74,10 +74,18 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8,
+        4,
        AComputeType,
        BComputeType>;

 #include "run_convnd_fwd_example.inc"

-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -51,10 +51,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -72,7 +72,7 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8>;
+        4>;

 #include "run_convnd_fwd_example.inc"

--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16_comp_fp8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16_comp_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -52,10 +52,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -73,9 +73,17 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8,
+        4,
        ComputeType>;

 #include "run_convnd_fwd_example.inc"

-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // fp8 are not supported on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -76,4 +76,11 @@ using DeviceGroupedConvNDFwdInstance =

 #include "run_convnd_fwd_example.inc"

-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp32_tf32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32_tf32.cpp
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+#define EXAMPLE_WITH_COMPUTE_DATATYPE
+
+using InDataType       = float;
+using WeiDataType      = float;
+using AccDataType      = float;
+using CShuffleDataType = float;
+using OutDataType      = float;
+using ComputeDataType  = ck::tf32_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,         // ALayout
+        WeiLayout,        // BLayout
+        ck::Tuple<>,      // DsLayout
+        OutLayout,        // ELayout
+        InDataType,       // ADataType
+        WeiDataType,      // BDataType
+        AccDataType,      // AccDataType
+        CShuffleDataType, // CShuffleDataType
+        ck::Tuple<>,      // DsDataType
+        OutDataType,      // EDataType
+        InElementOp,      // AElementwiseOperation
+        WeiElementOp,     // BElementwiseOperation
+        OutElementOp,     // CDEElementwiseOperation
+        ConvSpec,         // ConvForwardSpecialization
+        GemmSpec,         // GemmSpecialization
+        1,                // NumGemmKPrefetchStage
+        256,              // BlockSize
+        128,              // MPerBlock
+        192,              // NPerBlock
+        16,               // KPerBlock
+        4,                // AK1
+        4,                // BK1
+        32,               // MPerXdl
+        32,               // NPerXdl
+        2,                // MXdlPerWave
+        3,                // NXdlPerWave
+        S<4, 64, 1>,      // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,       // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,       // ABlockTransferSrcAccessOrder
+        2,                // ABlockTransferSrcVectorDim
+        4,                // ABlockTransferSrcScalarPerVector
+        4,                // ABlockTransferDstScalarPerVector_AK1
+        1,                // ABlockLdsExtraM
+        S<4, 64, 1>,      // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,       // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,       // BBlockTransferSrcAccessOrder
+        2,                // BBlockTransferSrcVectorDim
+        4,                // BBlockTransferSrcScalarPerVector
+        4,                // BBlockTransferDstScalarPerVector_BK1
+        1,                // BBlockLdsExtraN
+        1,                // CShuffleMXdlPerWavePerShuffle
+        1,                // CShuffleNXdlPerWavePerShuffle
+        S<1, 16, 1, 16>,  // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        4,                // CDEBlockTransferScalarPerVector_NPerBlock
+        ComputeDataType,  // AComputeDataType
+        ComputeDataType,  // BComputeDataType
+        ck::LoopScheduler::Default, // LoopScheduler
+        1                           // NumGroupsToMerge
+        >;
+
+#include "run_convnd_fwd_example.inc"
+
+int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+
+#undef EXAMPLE_WITH_COMPUTE_DATATYPE
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -7,6 +7,8 @@

 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"

+#define EXAMPLE_WITH_COMPUTE_DATATYPE
+
 using InDataType       = ck::f8_t;
 using WeiDataType      = ck::f8_t;
 using AccDataType      = float;
@@ -52,10 +54,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -73,9 +75,19 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8,
+        4,
        ComputeDataType>;

 #include "run_convnd_fwd_example.inc"

-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
+
+#undef EXAMPLE_WITH_COMPUTE_DATATYPE
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp8_bf8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp8_bf8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -53,10 +53,10 @@ using DeviceGroupedConvNDFwdInstance =
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -74,10 +74,18 @@ using DeviceGroupedConvNDFwdInstance =
        1,
        1,
        S<1, 32, 1, 8>,
-        8,
+        4,
        AComputeType,
        BComputeType>;

 #include "run_convnd_fwd_example.inc"

-int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
+int main(int argc, char* argv[])
+{
+    // temp disable on gfx11
+    if(ck::is_gfx11_supported())
+    {
+        return 0;
+    }
+    return run_convnd_fwd_example(argc, argv) ? 0 : 1;
+}
--- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_common.hpp"

@@ -51,10 +51,10 @@ using DeviceGroupedConvNDFwdInstance =
        64,          // KPerBlock
        16,          // AK1
        16,          // BK1
-        32,          // MPerXdl
-        32,          // NPerXdl
-        2,           // MXdlPerWave
-        4,           // NXdlPerWave
+        16,          // MPerXdl
+        16,          // NPerXdl
+        4,           // MXdlPerWave
+        8,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
@@ -71,8 +71,8 @@ using DeviceGroupedConvNDFwdInstance =
        1,           // BBlockLdsExtraN
        1,
        1,
-        S<1, 64, 1, 4>,
-        16>;
+        S<1, 32, 1, 8>,
+        4>;

 #include "run_convnd_fwd_example.inc"

--- a/example/09_convnd_fwd/run_convnd_fwd_example.inc
+++ b/example/09_convnd_fwd/run_convnd_fwd_example.inc
@@ -3,6 +3,11 @@

 #pragma once

+// use macro to minimize code change
+#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
+using ComputeDataType = AccDataType;
+#endif
+
 bool run_convnd_fwd_example(int argc, char* argv[])
 {
    print_helper_msg();
@@ -65,17 +70,17 @@ bool run_convnd_fwd_example(int argc, char* argv[])
            InElementOp,
            WeiElementOp,
            OutElementOp,
-            DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>>(
-            do_verification,
-            init_method,
-            time_kernel,
-            conv_param,
-            in_g_n_c_wis_desc,
-            wei_g_k_c_xs_desc,
-            out_g_n_k_wos_desc,
-            in_element_op,
-            wei_element_op,
-            out_element_op);
+            DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>,
+            ComputeDataType>(do_verification,
+                             init_method,
+                             time_kernel,
+                             conv_param,
+                             in_g_n_c_wis_desc,
+                             wei_g_k_c_xs_desc,
+                             out_g_n_k_wos_desc,
+                             in_element_op,
+                             wei_element_op,
+                             out_element_op);
    };

    namespace ctc = ck::tensor_layout::convolution;
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/README.md
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/README.md
@@ -0,0 +1,57 @@
+# N-Dimensional Convolution with Multiple D and Multiple Reduce
+
+## Theory
+
+This example demonstrates **N-dimensional convolution forward** with support for multiple auxiliary tensors (D) and multiple reduction operations. This is useful for advanced neural network layers that require additional outputs or statistics alongside the main convolution result.
+
+**Mathematical Formulation:**
+- Input tensor: $X[N, C_{in}, D_1, D_2, ..., D_n]$
+- Weight tensor: $W[C_{out}, C_{in}, K_1, K_2, ..., K_n]$
+- Auxiliary tensors: $D_0, D_1, ...$ (various shapes)
+- Output tensor: $Y[N, C_{out}, O_1, O_2, ..., O_n]$
+- Reduction operations: e.g., sum, mean, max over specified axes
+
+The convolution computes the standard output as well as additional outputs or statistics by applying reduction operations to the convolution result or auxiliary tensors.
+
+**Algorithmic Background:**
+- Composable Kernel implements this as an implicit GEMM with support for multiple auxiliary tensors and reductions in the epilogue.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./convnd_fwd_multiple_d_multiple_reduce_xdl --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/10_convnd_fwd_multiple_d_multiple_reduce/
+├── convnd_fwd_multiple_d_multiple_reduce_xdl.cpp   # Main example: sets up, runs, and verifies N-D convolution with multiple D/reduce
+include/ck/tensor_operation/gpu/device/
+│   └── device_convnd_fwd_multiple_d_multiple_reduce.hpp   # Device-level API for multi-D/multi-reduce convolution
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_convnd_fwd_multiple_d_multiple_reduce_impl.hpp # Implementation
+include/ck/tensor_operation/gpu/grid/
+    └── gridwise_convnd_fwd_multiple_d_multiple_reduce.hpp # Grid-level kernel
+```
+
+### Key Classes and Functions
+
+- **DeviceConvNdFwdMultipleDMultipleReduce** (in `device_convnd_fwd_multiple_d_multiple_reduce.hpp`):  
+  Device API for N-dimensional convolution with multiple outputs and reductions.
+- **gridwise_convnd_fwd_multiple_d_multiple_reduce** (in `gridwise_convnd_fwd_multiple_d_multiple_reduce.hpp`):  
+  Implements the tiled/blocking convolution kernel with multi-output/reduce epilogue.
+
+This example demonstrates how Composable Kernel supports advanced convolution patterns with multiple outputs and reductions in a single efficient kernel.
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
@@ -26,6 +26,10 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using BF16 = ck::bhalf_t;
 using FP16 = ck::half_t;
 using FP32 = float;
@@ -125,7 +129,7 @@ inline bool parse_cmd_args(int argc,

        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
        problem_size                      = ck::utils::conv::parse_conv_param(
-            num_dim_spatial, threshold_to_catch_partial_args, argv);
+            num_dim_spatial, threshold_to_catch_partial_args + 1, argv);
    }
    else
    {
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -15,4 +15,11 @@ using RsDataType        = ck::Tuple<R0DataType>;

 #include "run_convnd_fwd_max_example.inc"

-int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+    return !run_convnd_fwd_max_example(argc, argv);
+}
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
@@ -23,7 +23,7 @@ using RsGlobalReduceOp =
 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

 // clang-format off
 template <ck::index_t NDimSpatial>
@@ -36,7 +36,7 @@ using DeviceInstance =
 #ifdef BUILD_INT4_EXAMPLE
        < NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>, KernelADataType, KernelBDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,       ConvSpec,   GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
 #else
-        < NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>,       ADataType,       BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,       ConvSpec,   GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
+        < NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>,       ADataType,       BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,       ConvSpec,   GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,                    4,                  1>;
 #endif

 template <ck::index_t NDimSpatial>
--- a/example/11_convnd_fwd_bias/README.md
+++ b/example/11_convnd_fwd_bias/README.md
@@ -0,0 +1,57 @@
+# N-Dimensional Convolution Forward with Bias
+
+## Theory
+
+This example demonstrates **N-dimensional convolution forward** with bias addition. This is a common pattern in convolutional neural networks (CNNs), where a bias term is added to each output channel after the convolution operation.
+
+**Mathematical Formulation:**
+$$
+Y[n, c_{out}, o_1, ..., o_n] = \sum_{c_{in}} \sum_{k_1} ... \sum_{k_n} X[n, c_{in}, o_1 + k_1, ..., o_n + k_n] \cdot W[c_{out}, c_{in}, k_1, ..., k_n] + B[c_{out}]
+$$
+- $X$: [N, C_in, D1, D2, ..., Dn] input tensor
+- $W$: [C_out, C_in, K1, K2, ..., Kn] weight tensor
+- $B$: [C_out] bias tensor
+- $Y$: [N, C_out, O1, O2, ..., On] output tensor
+
+**Algorithmic Background:**
+- Composable Kernel implements convolution as an implicit GEMM, with bias addition fused in the epilogue for efficiency.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/11_convnd_fwd_bias
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./convnd_fwd_bias_xdl --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/11_convnd_fwd_bias/
+├── convnd_fwd_bias_xdl.cpp         # Main example: sets up, runs, and verifies N-D convolution with bias
+include/ck/tensor_operation/gpu/device/
+│   └── device_convnd_fwd_bias.hpp       # Device-level convolution API with bias
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_convnd_fwd_bias_impl.hpp  # Implementation
+include/ck/tensor_operation/gpu/grid/
+    └── gridwise_convnd_fwd_bias.hpp     # Grid-level kernel
+```
+
+### Key Classes and Functions
+
+- **DeviceConvNdFwdBias** (in `device_convnd_fwd_bias.hpp`):  
+  Device API for N-dimensional convolution with bias.
+- **gridwise_convnd_fwd_bias** (in `gridwise_convnd_fwd_bias.hpp`):  
+  Implements the tiled/blocking convolution kernel with bias epilogue.
+
+This example demonstrates how Composable Kernel fuses bias addition into the convolution forward pass for efficient CNN layer implementation.
--- a/example/12_reduce/README.md
+++ b/example/12_reduce/README.md
@@ -1,6 +1,38 @@
-# Instructions for ```example_reduce_blockwise```
+# Parallel Reduction Operations
+
+## Theory
+
+This example demonstrates **parallel reduction operations** (e.g., sum, max, min, mean) over tensors. Reduction is a fundamental operation in deep learning for computing statistics (such as batch mean/variance), loss aggregation, and normalization.
+
+**Mathematical Formulation:**
+Given a tensor $X$ and a reduction axis $a$:
+$$
+Y = \text{reduce}_{a}(X)
+$$
+- For sum: $Y = \sum_{i \in a} X_i$
+- For max: $Y = \max_{i \in a} X_i$
+- For mean: $Y = \frac{1}{|a|} \sum_{i \in a} X_i$
+
+**Algorithmic Background:**
+- Reductions are implemented using parallel tree reduction or segmented reduction algorithms.
+- Efficient reductions require careful memory access, synchronization, and sometimes numerically stable algorithms (e.g., Welford's for variance).
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/12_reduce
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+```

 ## Run ```example_reduce_blockwise```
+
 ```bash
 # -D <xxx> : input 3D/4D/5D tensor lengths
 # -R <xxx> : reduce dimension ids
@@ -11,7 +43,8 @@
 ./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 0 2 1
 ```

-Result
+Expected Result:
+
 ```
 ./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 0 2 1
 launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} 
@@ -21,6 +54,7 @@ Perf: 0.238063 ms, 264.285 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSr
 ```

 ## Run ```example_reduce_multiblock_atomic_add```
+
 ```bash
 # -D <xxx> : input 3D/4D/5D tensor lengths
 # -R <xxx> : reduce dimension ids
@@ -31,7 +65,7 @@ Perf: 0.238063 ms, 264.285 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSr
 ./bin/example_reduce_multiblock_atomic_add -D 16,64,32,960 -v 1 0 2 0
 ```

-Result
+Expected Result
 ```
 ./bin/example_reduce_multiblock_atomic_add -D 16,64,32,960 -v 1 0 2 0
 Perf: 0 ms, inf GB/s, DeviceReduceMultiBlock<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
@@ -42,6 +76,7 @@ echo $?
 # Instructions for ```example_reduce_blockwise_two_call```

 ## Run ```example_reduce_blockwise_two_call```
+
 ```bash
 #arg1:  verification (0=no, 1=yes(
 #arg2:  initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
@@ -49,7 +84,8 @@ echo $?
 ./bin/example_reduce_blockwise_two_call 1 2 1
 ```

-Result
+Expected Result:
+
 ```
 ./bin/example_reduce_blockwise_two_call 1 2 1
 launch_and_time_kernel: grid_dim {204800, 1, 1}, block_dim {256, 1, 1}
@@ -60,3 +96,30 @@ Warm up 1 time
 Start running 10 times...
 Perf: 2.1791 ms, 771.42 GB/s, DeviceReduceBlockWise<256,M_C32_S1,K_C8_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> => DeviceReduceBlockWise<256,M_C256_S1,K_C1_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1>
 ```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/12_reduce/
+├── reduce_xdl.cpp         # Main example: sets up, runs, and verifies reduction
+include/ck/tensor_operation/gpu/device/
+│   └── device_reduce.hpp       # Device-level reduction API
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_reduce_impl.hpp  # Implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_reduce.hpp     # Grid-level reduction kernel
+include/ck/tensor_operation/gpu/block/
+    └── blockwise_reduce.hpp    # Block-level reduction
+```
+
+### Key Classes and Functions
+
+- **DeviceReduce** (in `device_reduce.hpp`):  
+  Device API for reductions.
+- **gridwise_reduce** (in `gridwise_reduce.hpp`):  
+  Implements the tiled/blocking reduction kernel.
+- **blockwise_reduce** (in `blockwise_reduce.hpp`):  
+  Handles block-level reduction and shared memory.
+
+This example demonstrates how Composable Kernel implements efficient parallel reductions for deep learning and scientific computing.
--- a/example/12_reduce/reduce_blockwise_two_call.cpp
+++ b/example/12_reduce/reduce_blockwise_two_call.cpp
@@ -20,6 +20,10 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/host_common_util.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using namespace ck;
 using namespace ck::tensor_operation::device;

@@ -100,13 +104,13 @@ int main(int argc, char* argv[])
    const std::array<int, 2> reduceDims = {3, 4};
    // const std::array<int, 3> invariantDims = {0, 1, 2};

-    const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
+    std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};

    // input lengths of the second reduction, which is also the output lengths of the first
    // reduction
-    const std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
+    std::vector<size_t> inLengths_2 = {64, 320, 80, 4};

-    const std::vector<size_t> outLengths = {64, 320, 80};
+    std::vector<size_t> outLengths = {64, 320, 80};

    if(argc == 1)
    {
@@ -114,11 +118,26 @@ int main(int argc, char* argv[])
        init_method = 2;
        time_kernel = true;
    }
-    else if(argc == 4)
+    else if((argc == 4) || (argc == 9))
    {
        do_verify   = static_cast<bool>(argv[1]);
        init_method = atoi(argv[2]);
        time_kernel = static_cast<bool>(atoi(argv[3]));
+        if(argc == 9)
+        {
+            inLengths_1[0] = atoi(argv[4]);
+            inLengths_1[1] = atoi(argv[5]);
+            inLengths_1[2] = atoi(argv[6]);
+            inLengths_1[3] = atoi(argv[7]);
+            inLengths_1[4] = atoi(argv[8]);
+            inLengths_2[0] = inLengths_1[0];
+            inLengths_2[1] = inLengths_1[1];
+            inLengths_2[2] = inLengths_1[2];
+            inLengths_2[3] = inLengths_1[3];
+            outLengths[0]  = inLengths_1[0];
+            outLengths[1]  = inLengths_1[1];
+            outLengths[2]  = inLengths_1[2];
+        }
    }
    else
    {
--- a/example/13_pool2d_fwd/README.md
+++ b/example/13_pool2d_fwd/README.md
@@ -1,6 +1,41 @@
-# Instructions for ```example_pool2d_fwd``` Examples
+# 2D Pooling Forward
+
+## Theory
+
+This example demonstrates the **2D pooling forward pass**, a key operation in convolutional neural networks (CNNs) for spatial downsampling. Pooling reduces the spatial dimensions of feature maps, providing translation invariance and reducing computation.
+
+**Mathematical Formulation:**
+Given input $X[N, C, H_{in}, W_{in}]$, pooling window $(k_H, k_W)$, stride $(s_H, s_W)$, and padding $(p_H, p_W)$:
+- Output $Y[N, C, H_{out}, W_{out}]$
+- $H_{out} = \left\lfloor \frac{H_{in} + 2p_H - k_H}{s_H} \right\rfloor + 1$
+- $W_{out} = \left\lfloor \frac{W_{in} + 2p_W - k_W}{s_W} \right\rfloor + 1$
+
+For each output position:
+- **Max Pooling:** $Y_{n,c,h,w} = \max_{i,j} X_{n,c,h \cdot s_H + i, w \cdot s_W + j}$
+- **Average Pooling:** $Y_{n,c,h,w} = \frac{1}{k_H k_W} \sum_{i,j} X_{n,c,h \cdot s_H + i, w \cdot s_W + j}$
+
+**Algorithmic Background:**
+- Each thread computes one or more output elements.
+- Handles padding and boundary conditions.
+- Optimizes memory access for bandwidth.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/13_pool2d_fwd
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+```
+
+### Run ```example_pool2d_fwd_fp16```

-## Run ```example_pool2d_fwd_fp16```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
@@ -9,7 +44,7 @@
 ./bin/example_pool2d_fwd_fp16 1 1 1
 ```

-Result 
+Expected Result: 
 ```
 in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
 out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
@@ -19,7 +54,8 @@ Start running 10 times...
 Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s
 ```

-## Run ```example_pool2d_fwd_fp32```
+### Run ```example_pool2d_fwd_fp32```
+
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
@@ -29,8 +65,9 @@ Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s
 ```


-Result 
-```
+Expected Result: 
+
+```bash
 ./bin/example_pool2d_fwd_fp32 1 1 1
 in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
 out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
@@ -39,3 +76,31 @@ Warm up 1 time
 Start running 10 times...
 Perf: 1.01823 ms, 0.563045 TFlops, 611.8 GB/s
 ```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/13_pool2d_fwd/
+├── pool2d_fwd_xdl.cpp         # Main example: sets up, runs, and verifies 2D pooling
+include/ck/tensor_operation/gpu/device/
+│   └── device_pool_fwd.hpp       # Device-level pooling API
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_pool2d_fwd_nhwc.hpp # NHWC layout optimization
+│   └── device_pool2d_fwd_nchw.hpp # NCHW layout optimization
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_pool_fwd.hpp     # Grid-level pooling kernel
+include/ck/tensor_operation/gpu/block/
+    └── blockwise_pool.hpp        # Block-level pooling
+```
+
+### Key Classes and Functions
+
+- **DevicePoolFwd** (in `device_pool_fwd.hpp`):  
+  Device API for pooling.
+- **gridwise_pool_fwd** (in `gridwise_pool_fwd.hpp`):  
+  Implements the tiled/blocking pooling kernel.
+- **blockwise_pool** (in `blockwise_pool.hpp`):  
+  Handles block-level pooling and shared memory.
+
+This example demonstrates how Composable Kernel implements efficient 2D pooling for CNNs and vision models.
--- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
@@ -19,6 +19,10 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <typename InDataType,
          typename OutDataType,
          typename ComputeDataType,
@@ -78,12 +82,12 @@ bool pool_test(bool do_verification,

            if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value)
            {
-                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz}, layout);
            }
            else if constexpr(ck::is_same<decltype(layout),
                                          ck::tensor_layout::convolution::NHWC>::value)
            {
-                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_}, layout);
            }
        };

--- a/example/14_gemm_quantization/CMakeLists.txt
+++ b/example/14_gemm_quantization/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_example_executable(example_gemm_dl_quantization_int8 gemm_dl_quantization_int8.cpp)
+add_example_executable(example_gemm_wmma_quantization_int8 gemm_wmma_quantization_int8.cpp)
 add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp)
 add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp)
--- a/example/14_gemm_quantization/README.md
+++ b/example/14_gemm_quantization/README.md
@@ -0,0 +1,60 @@
+# GEMM with Quantization
+
+## Theory
+
+This example demonstrates **GEMM (General Matrix Multiplication) with quantized inputs or weights**. Quantization is a technique to reduce memory and computation by representing values with lower-precision integer types (e.g., int8), commonly used for efficient inference in deep learning.
+
+**Mathematical Formulation:**
+- Quantized GEMM: $C = \text{dequant}(A_q) \times \text{dequant}(B_q)$
+- $A_q$, $B_q$: quantized matrices (e.g., int8)
+- $\text{dequant}(x_q) = (x_q - z) \cdot s$ (scale $s$, zero-point $z$)
+- $C$: output matrix (often in higher precision, e.g., float32 or float16)
+
+**Algorithmic Background:**
+- Quantized values are dequantized on-the-fly during GEMM computation.
+- Accumulation is performed in higher precision for accuracy.
+- Supports symmetric and asymmetric quantization.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/14_gemm_quantization
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./gemm_quantization_xdl --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/14_gemm_quantization/
+├── gemm_quantization_xdl.cpp         # Main example: sets up, runs, and verifies quantized GEMM
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm_quantized.hpp       # Device-level quantized GEMM API
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_gemm_quantized_impl.hpp  # Implementation
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_gemm_quantized.hpp     # Grid-level quantized GEMM kernel
+include/ck/tensor_operation/gpu/element/
+    └── quantization_operations.hpp     # Quantization/dequantization utilities
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmQuantized** (in `device_gemm_quantized.hpp`):  
+  Device API for quantized GEMM.
+- **gridwise_gemm_quantized** (in `gridwise_gemm_quantized.hpp`):  
+  Implements the tiled/blocking quantized GEMM kernel.
+- **quantization_operations** (in `quantization_operations.hpp`):  
+  Defines quantization and dequantization functions.
+
+This example demonstrates how Composable Kernel supports efficient quantized matrix multiplication for deep learning inference.
--- a/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

@@ -115,12 +119,14 @@ int main()
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1_uz}));
+                                            std::vector<std::size_t>({stride, 1_uz}),
+                                            layout);
            }
            else
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1_uz, stride}));
+                                            std::vector<std::size_t>({1_uz, stride}),
+                                            layout);
            }
        };

--- a/example/14_gemm_quantization/gemm_wmma_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_wmma_quantization_int8.cpp
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using I8  = int8_t;
+using I32 = int32_t;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using ActivationOp = PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
+
+using ADataType        = I8;
+using BDataType        = I8;
+using AccDataType      = I32;
+using CShuffleDataType = I32;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = I8;
+
+using ALayout  = Col;
+using BLayout  = Row;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Wmma_CShuffleV3<
+    ALayout,
+    BLayout,
+    DsLayout,
+    ELayout,
+    ADataType,
+    BDataType,
+    DsDataType,
+    EDataType,
+    AccDataType,
+    CShuffleDataType,
+    ActivationOp,
+    ActivationOp,
+    CDEElementOp,
+    GemmDefault,
+    256,
+    128,
+    128,
+    64,
+    8,
+    8,
+    16,
+    16,
+    4,
+    2,
+    S<4, 64, 1>,
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    1,
+    8,
+    true,
+    S<4, 64, 1>,
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    1,
+    8,
+    true,
+    1,
+    1,
+    S<1, 32, 1, 8>,
+    S<1>,
+    ck::BlockGemmPipelineScheduler::Intrawave,
+    ck::BlockGemmPipelineVersion::v1,
+    I8,
+    I8>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, EDataType, float, PassThrough, PassThrough, CDEElementOp>;
+
+int main(int /* argc */, char* /* argv */[])
+{
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    // GEMM shape
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = N;
+    ck::index_t StrideE = N;
+
+    float requant_scale = 0.03;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+    b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+
+    auto a_element_op   = PassThrough{};
+    auto b_element_op   = PassThrough{};
+    auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
+
+    // device GEMM
+    auto gemm    = DeviceGemmInstance{};
+    auto invoker = gemm.MakeInvoker();
+
+    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                      std::array<const void*, 0>{},
+                                      static_cast<EDataType*>(e_device_buf.GetDeviceBuffer()),
+                                      M,
+                                      N,
+                                      K,
+                                      StrideA,
+                                      StrideB,
+                                      std::array<ck::index_t, 0>{},
+                                      StrideE,
+                                      1,
+                                      a_element_op,
+                                      b_element_op,
+                                      cde_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+    if(do_verification)
+    {
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n, e_m_n_host_result, a_element_op, b_element_op, cde_element_op);
+
+        ref_invoker.Run(ref_argument);
+
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
--- a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

@@ -70,10 +74,10 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
     64,                         // KPerBlock,
     16,                         // AK1,
     16,                         // BK1,
-     32,                         // MPerXDL,
-     32,                         // NPerXDL,
-     4,                          // MXdlPerWave,
-     2,                          // NXdlPerWave,
+     16,                         // MPerXDL,
+     16,                         // NPerXDL,
+     8,                          // MXdlPerWave,
+     4,                          // NXdlPerWave,
     S<4, 64, 1>,                // ABlockTransferThreadClusterLengths_AK0_M_AK1,
     S<1, 0, 2>,                 // ABlockTransferThreadClusterArrangeOrder,
     S<1, 0, 2>,                 // ABlockTransferSrcAccessOrder,
@@ -90,8 +94,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
     1,                          // bool BBlockLdsExtraN,
     1,                          // index_t CShuffleMXdlPerWavePerShuffle,
     1,                          // index_t CShuffleNXdlPerWavePerShuffle,
-     S<1, 64, 1, 4>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-     8>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
+     S<1, 32, 1, 8>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+     4>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
--- a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

@@ -68,10 +72,10 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
     64,                         // KPerBlock,
     16,                         // AK1,
     16,                         // BK1,
-     32,                         // MPerXDL,
-     32,                         // NPerXDL,
-     4,                          // MXdlPerWave,
-     2,                          // NXdlPerWave,
+     16,                         // MPerXDL,
+     16,                         // NPerXDL,
+     8,                          // MXdlPerWave,
+     4,                          // NXdlPerWave,
     S<4, 64, 1>,                // ABlockTransferThreadClusterLengths_AK0_M_AK1,
     S<1, 0, 2>,                 // ABlockTransferThreadClusterArrangeOrder,
     S<1, 0, 2>,                 // ABlockTransferSrcAccessOrder,
@@ -88,8 +92,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
     1,                          // bool BBlockLdsExtraN,
     1,                          // index_t CShuffleMXdlPerWavePerShuffle,
     1,                          // index_t CShuffleNXdlPerWavePerShuffle,
-     S<1, 64, 1, 4>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-     16>;                        // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
+     S<1, 32, 1, 8>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+     4>;                        // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::
--- a/example/15_grouped_gemm/CMakeLists.txt
+++ b/example/15_grouped_gemm/CMakeLists.txt
@@ -33,3 +33,13 @@ if(USE_BITINT_EXTENSION_INT4)
    add_example_executable(example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp)
    add_example_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int4)
 endif()
+
+list(APPEND gpu_list_tf32 gfx942 gfx950)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0)
+        add_example_executable(example_grouped_gemm_xdl_fp32_tf32 grouped_gemm_xdl_fp32_tf32.cpp)
+        add_example_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fp32_tf32)
+        set(target 1)
+    endif()
+endforeach()
--- a/example/15_grouped_gemm/README.md
+++ b/example/15_grouped_gemm/README.md
@@ -1,9 +1,64 @@
-# Instructions for ```example_grouped_gemm_xdl```
+# Grouped GEMM
+
+## Theory
+
+This example demonstrates **grouped GEMM**: performing multiple independent GEMM operations (with potentially different shapes) in a single kernel launch. Grouped GEMM is used in transformer models (e.g., multi-head attention), mixture-of-experts, and other architectures requiring heterogeneous batched matrix multiplications.
+
+**Mathematical Formulation:**
+For $G$ groups, each with its own $A_g$, $B_g$, $C_g$:
+$$
+C_g = A_g \times B_g \quad \text{for} \quad g = 1, 2, ..., G
+$$
+- $A_g$: [M_g, K_g] input matrix for group $g$
+- $B_g$: [K_g, N_g] weight matrix for group $g$
+- $C_g$: [M_g, N_g] output matrix for group $g$
+
+**Algorithmic Background:**
+- Each group can have different matrix sizes and strides.
+- The kernel launches a grid covering all groups, with each block assigned to a group.
+- Useful for variable-length sequences, multi-head attention, and expert routing.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/15_grouped_gemm
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+```
+
+### Run ```example_grouped_gemm_xdl```

-## Run ```example_grouped_gemm_xdl```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 #arg3: run kernel # of times (>1)
 ./bin/example_grouped_gemm_xdl_fp16 0 1 5
 ```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/15_grouped_gemm/
+├── grouped_gemm_xdl.cpp         # Main example: sets up, runs, and verifies grouped GEMM
+include/ck/tensor_operation/gpu/device/
+│   └── device_grouped_gemm_xdl.hpp       # Device-level grouped GEMM API
+include/ck/tensor_operation/gpu/grid/
+│   └── gridwise_grouped_gemm_xdl.hpp     # Grid-level grouped GEMM kernel
+```
+
+### Key Classes and Functions
+
+- **DeviceGroupedGemmXdl** (in `device_grouped_gemm_xdl.hpp`):  
+  Device API for grouped GEMM.
+- **gridwise_grouped_gemm_xdl** (in `gridwise_grouped_gemm_xdl.hpp`):  
+  Implements the tiled/blocking grouped GEMM kernel.
+
+This example demonstrates how Composable Kernel supports efficient heterogeneous batched matrix multiplication for advanced AI/ML workloads.
--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp
@@ -25,6 +25,11 @@
 #include "ck/utility/tuple.hpp"
 #include "ck/utility/sequence.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp
@@ -23,6 +23,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -23,6 +23,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

@@ -63,7 +68,7 @@ using DeviceGemmInstance =
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,    S<4, 64, 1>,     S<1, 0, 2>,      S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,       S<4,4,4>>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   16,   16,    2,    4,    S<4, 64, 1>,     S<1, 0, 2>,      S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,       S<4,4,4>>;
 // clang-format on

 struct ProblemSize final
--- a/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -19,6 +19,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

@@ -54,7 +59,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
 // clang-format on

 #include "run_grouped_gemm_example.inc"
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
@@ -20,6 +20,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

@@ -323,6 +328,31 @@ int main(int argc, char* argv[])

    problem_size.Ms = {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};

+    if(argc == 5)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+        config.k_batch         = std::stoi(argv[4]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        config.k_batch           = std::stoi(argv[4]);
+        problem_size.group_count = std::stoi(argv[5]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: k_batch (>0)\n");
+        printf("arg5: group count (default=16)");
+        exit(0);
+    }
+
    for(int i = 0; i < problem_size.group_count; i++)
    {
        problem_size.Ns.push_back(768);
@@ -333,21 +363,5 @@ int main(int argc, char* argv[])
        problem_size.stride_Cs.push_back(problem_size.Ns[i]);
    }

-    if(argc == 5)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-        config.k_batch         = std::stoi(argv[4]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4: k_batch (>0)\n");
-        exit(0);
-    }
-
    return !run_grouped_gemm(problem_size, config);
 }
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
@@ -20,6 +20,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

@@ -296,6 +301,32 @@ int main(int argc, char* argv[])

    problem_size.group_count = 16;

+    if(argc == 5)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+        config.k_batch         = std::stoi(argv[4]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        config.k_batch           = std::stoi(argv[4]);
+        problem_size.group_count = std::stoi(argv[5]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: k_batch (> 0)\n");
+        printf("arg5: group count (default=16)");
+
+        exit(0);
+    }
+
    for(int i = 0; i < problem_size.group_count; i++)
    {
        problem_size.Ms.push_back(128 + rand() % 128);
@@ -307,21 +338,5 @@ int main(int argc, char* argv[])
        problem_size.stride_Cs.push_back(problem_size.Ns[i]);
    }

-    if(argc == 5)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-        config.k_batch         = std::stoi(argv[4]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4: k_batch (> 0)\n");
-        exit(0);
-    }
-
    return !run_grouped_gemm(problem_size, config);
 }
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
@@ -20,6 +20,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

@@ -297,6 +302,31 @@ int main(int argc, char* argv[])

    problem_size.group_count = 16;

+    if(argc == 5)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+        config.k_batch         = std::stoi(argv[4]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        config.k_batch           = std::stoi(argv[4]);
+        problem_size.group_count = std::stoi(argv[5]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: k_batch (> 0)\n");
+        printf("arg5: group count (default=16)");
+        exit(0);
+    }
+
    for(int i = 0; i < problem_size.group_count; i++)
    {
        problem_size.Ms.push_back(256 + 256 * i);
@@ -308,21 +338,5 @@ int main(int argc, char* argv[])
        problem_size.stride_Cs.push_back(problem_size.Ns[i]);
    }

-    if(argc == 5)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-        config.k_batch         = std::stoi(argv[4]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4: k_batch (> 0)\n");
-        exit(0);
-    }
-
    return !run_grouped_gemm(problem_size, config);
 }
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -19,6 +19,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

@@ -54,7 +59,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
 // clang-format on

 #include "run_grouped_gemm_example.inc"
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -19,6 +19,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

@@ -54,9 +59,16 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    16,   4,   4,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               2>;
 // clang-format on

 #include "run_grouped_gemm_example.inc"

-int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+    return !run_grouped_gemm_example(argc, argv);
+}
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp32_tf32.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32_tf32.cpp
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+
+#define EXAMPLE_WITH_COMPUTE_DATATYPE
+
+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = F32;
+using ComputeDataType  = ck::tf32_t;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
+    // clang-format off
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4, ck::LoopScheduler::Default, ComputeDataType>;
+// clang-format on
+
+#include "run_grouped_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); }
+
+#undef EXAMPLE_WITH_COMPUTE_DATATYPE
--- a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
@@ -19,6 +19,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -19,6 +19,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

@@ -51,7 +56,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
 //######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 8>,             4>;
 // clang-format on

 #include "run_grouped_gemm_example.inc"
--- a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
@@ -19,6 +19,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

@@ -66,6 +71,28 @@ int main(int argc, char* argv[])

    problem_size.group_count = 16;

+    if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 5)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        problem_size.group_count = std::stoi(argv[4]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: group count (default=16)");
+        exit(0);
+    }
+
    for(int i = 0; i < problem_size.group_count; i++)
    {
        problem_size.Ms.push_back(256 + 256 * i);
@@ -77,19 +104,5 @@ int main(int argc, char* argv[])
        problem_size.stride_Cs.push_back(problem_size.Ns[i]);
    }

-    if(argc == 4)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        exit(0);
-    }
-
    return !run_grouped_gemm(problem_size, config);
 }
--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
@@ -3,6 +3,11 @@

 #pragma once

+// use macro to minimize code change
+#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
+using ComputeDataType = AccDataType;
+#endif
+
 struct ProblemSize final
 {
    std::vector<ck::index_t> Ms;
@@ -231,7 +236,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
                                                                                AccDataType,
                                                                                AElementOp,
                                                                                BElementOp,
-                                                                                CDEElementOp>;
+                                                                                CDEElementOp,
+                                                                                ComputeDataType>;

        for(std::size_t i = 0; i < gemm_descs.size(); i++)
        {
@@ -253,7 +259,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
            pass &= ck::utils::check_err(c_device_result_converted, c_host_tensors[i]);

 #else
-            pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]);
+            pass &= ck::utils::check_err<decltype(c_device_tensors[i]),
+                                         decltype(c_host_tensors[i]),
+                                         ComputeDataType>(c_device_tensors[i], c_host_tensors[i]);
 #endif
        }
    }
@@ -278,28 +286,20 @@ bool run_grouped_gemm_example(int argc, char* argv[])

    problem_size.group_count = 16;

-    for(int i = 0; i < problem_size.group_count; i++)
+    if(argc == 1)
    {
-        problem_size.Ms.push_back(256 + 256 * i);
-        problem_size.Ns.push_back(128 + 128 * i);
-        problem_size.Ks.push_back(128 + 64 * i);
-
-        problem_size.stride_As.push_back(problem_size.Ks[i]);
-        problem_size.stride_Bs.push_back(problem_size.Ks[i]);
-        problem_size.stride_Cs.push_back(problem_size.Ns[i]);
+        // use default cases
    }
-    if(argc == 4)
+    else if(argc == 4 || argc == 6)
    {
        config.do_verification = std::stoi(argv[1]);
        config.init_method     = std::stoi(argv[2]);
        config.time_kernel     = std::stoi(argv[3]);
-    }
-    else if(argc == 5)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-        config.async_hargs     = std::stoi(argv[4]);
+        if(argc == 6)
+        {
+            config.async_hargs       = std::stoi(argv[4]);
+            problem_size.group_count = std::stoi(argv[5]);
+        }
    }
    else
    {
@@ -307,7 +307,34 @@ bool run_grouped_gemm_example(int argc, char* argv[])
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=n0, 1=yes)\n");
        printf("arg4: async hargs (0=n0, 1=yes)\n");
-        exit(0);
+        printf("arg5: group count (default=16)");
+        exit(1);
+    }
+
+    // Lambda to get stride based on layout
+    auto get_stride = [](auto layout, auto row_dim, auto col_dim) {
+        if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+        {
+            return col_dim;
+        }
+        else
+        {
+            return row_dim;
+        }
+    };
+
+    for(int i = 0; i < problem_size.group_count; i++)
+    {
+        problem_size.Ms.push_back(256 + 256 * i);
+        problem_size.Ns.push_back(128 + 128 * i);
+        problem_size.Ks.push_back(128 + 64 * i);
+
+        problem_size.stride_As.push_back(
+            get_stride(ALayout{}, problem_size.Ms[i], problem_size.Ks[i]));
+        problem_size.stride_Bs.push_back(
+            get_stride(BLayout{}, problem_size.Ks[i], problem_size.Ns[i]));
+        problem_size.stride_Cs.push_back(
+            get_stride(ELayout{}, problem_size.Ms[i], problem_size.Ns[i]));
    }

    return run_grouped_gemm(problem_size, config);
--- a/example/16_gemm_multi_d_multi_reduces/README.md
+++ b/example/16_gemm_multi_d_multi_reduces/README.md
@@ -0,0 +1,56 @@
+# GEMM with Multiple D and Multiple Reductions
+
+## Theory
+
+This example demonstrates **GEMM with multiple auxiliary tensors (D) and multiple reduction operations**. This pattern is used in advanced neural network layers that require additional outputs or statistics (such as sums, means, or other reductions) alongside the main GEMM result.
+
+**Mathematical Formulation:**
+- For each GEMM: $C = A \times B$
+- Auxiliary tensors: $D_0, D_1, ...$ (various shapes)
+- Reductions: e.g., sum, mean, max over specified axes or outputs
+
+The kernel computes the main GEMM output and additional reductions or statistics in a single pass.
+
+**Algorithmic Background:**
+- The GEMM result is kept in registers, auxiliary tensors are fused in the epilogue, and reductions are computed as part of the output.
+- Useful for multi-task learning, attention statistics, and custom neural network layers.
+
+## How to Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build and run
+```bash
+cd composable_kernel/example/16_gemm_multi_d_multi_reduces
+mkdir build && cd build
+cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
+make -j
+
+# Example run
+./gemm_multi_d_multi_reduces_xdl --verify=1 --time=1
+```
+
+## Source Code Structure
+
+### Directory Layout
+```
+example/16_gemm_multi_d_multi_reduces/
+├── gemm_multi_d_multi_reduces_xdl.cpp         # Main example: sets up, runs, and verifies GEMM with multi-D/multi-reduce
+include/ck/tensor_operation/gpu/device/
+│   └── device_gemm_multi_d_multi_reduces.hpp       # Device-level API for multi-D/multi-reduce GEMM
+include/ck/tensor_operation/gpu/device/impl/
+│   └── device_gemm_multi_d_multi_reduces_impl.hpp  # Implementation
+include/ck/tensor_operation/gpu/grid/
+    └── gridwise_gemm_multi_d_multi_reduces.hpp     # Grid-level kernel
+```
+
+### Key Classes and Functions
+
+- **DeviceGemmMultiDMultiReduces** (in `device_gemm_multi_d_multi_reduces.hpp`):  
+  Device API for GEMM with multiple outputs and reductions.
+- **gridwise_gemm_multi_d_multi_reduces** (in `gridwise_gemm_multi_d_multi_reduces.hpp`):  
+  Implements the tiled/blocking GEMM kernel with multi-output/reduce epilogue.
+
+This example demonstrates how Composable Kernel supports advanced GEMM patterns with multiple outputs and reductions in a single efficient kernel.
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

@@ -76,7 +80,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
 //######|        |        |        |      Type|      Type|            Type|         DataType|       Type|      Type|              Type|       Type| Elementwise| Elementwise|  Elementwise| Elementwise| Elementwise|           Reduce|           Reduce| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|       ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |   Operation|   Operation|    Operation|   Operation|   Operation|        Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _MPerBlock_NPerBlock|      ScalarPerVector|         _MPerBlock|
 //######|        |        |        |          |          |                |                 |           |          |                  |           |            |            |             |            |            |                 |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |           _NPerBlock|                   |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
+        < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,                    4,                  1>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -8,6 +8,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = INT8;
 using BDataType         = INT8;
@@ -72,10 +76,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         64,                        // KPerBlock
         16,                        // AK1
         16,                        // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
         S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
         S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
         S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -92,7 +96,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         1,                         // BBlockLdsExtraN
         1,                         // CShuffleMXdlPerWavePerShuffle
         1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
         4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
         1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = BF16;
 using BDataType         = BF16;
@@ -65,10 +69,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         32,                        // KPerBlock
         8,                         // AK1
         8,                         // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
         S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
         S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
         S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -85,7 +89,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         1,                         // BBlockLdsExtraN
         1,                         // CShuffleMXdlPerWavePerShuffle
         1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
         4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
         1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = F16;
 using BDataType         = F16;
@@ -65,10 +69,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         32,                        // KPerBlock
         8,                         // AK1
         8,                         // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
         S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
         S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
         S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -85,7 +89,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         1,                         // BBlockLdsExtraN
         1,                         // CShuffleMXdlPerWavePerShuffle
         1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
         4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
         1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = F32;
 using BDataType         = F32;
@@ -146,6 +150,11 @@ int main(int argc, char* argv[])
        exit(0);
    }

+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        return 0;
+    }
+
    return run_gemm_reduce_max_xdl<ADataType,
                                   BDataType,
                                   EDataType,
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using ADataType         = INT4;
 using ADataKernelType   = INT8;
 using BDataType         = INT4;
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using ADataType         = INT8;
 using BDataType         = INT8;
 using GemmAccDataType   = INT32;
@@ -64,10 +68,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         64,                        // KPerBlock
         16,                        // AK1
         16,                        // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
         S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
         S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
         S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -84,7 +88,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         1,                         // BBlockLdsExtraN
         1,                         // CShuffleMXdlPerWavePerShuffle
         1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
         4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
         1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = BF16;
 using BDataType         = BF16;
@@ -72,10 +76,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         32,                        // KPerBlock
         8,                         // AK1
         8,                         // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
         S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
         S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
         S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -92,7 +96,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         1,                         // BBlockLdsExtraN
         1,                         // CShuffleMXdlPerWavePerShuffle
         1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
         4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
         1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = F16;
 using BDataType         = F16;
@@ -72,10 +76,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         32,                        // KPerBlock
         8,                         // AK1
         8,                         // BK1
-         32,                        // MPerXdl
-         32,                        // NPerXdl
-         4,                         // MXdlPerWave
-         2,                         // NXdlPerWave
+         16,                        // MPerXdl
+         16,                        // NPerXdl
+         8,                         // MXdlPerWave
+         4,                         // NXdlPerWave
         S<4, 64, 1>,               // ABlockTransfer ThreadCluster Lengths_K0_M_K1
         S<1, 0, 2>,                // ABlockTransfer ThreadCluster ArrangeOrder
         S<1, 0, 2>,                // ABlockTransfer SrcAccessOrder
@@ -92,7 +96,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
         1,                         // BBlockLdsExtraN
         1,                         // CShuffleMXdlPerWavePerShuffle
         1,                         // CShuffleNXdlPerWavePerShuffle
-         S<64, 4>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
+         S<32, 8>,                  // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
         4,                         // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
         1>;                        // RThread DstScalarPerVector _MPerBlock
 // clang-format on
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "gemm_reduce_xdl_common.hpp"

@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = F32;
 using BDataType         = F32;
@@ -153,6 +157,11 @@ int main(int argc, char* argv[])
        exit(EXIT_SUCCESS);
    }

+    if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+    {
+        exit(EXIT_SUCCESS);
+    }
+
    return !run_gemm_reduce_mean_meansquare_xdl<ADataType,
                                                BDataType,
                                                EDataType,
--- a/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp
@@ -18,6 +18,10 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/17_convnd_bwd_data/README.md
+++ b/example/17_convnd_bwd_data/README.md
@@ -1,6 +1,62 @@
-# Instructions for ```example_convnd_bwd_data_xdl```
+# N-Dimensional Convolution Backward Pass for Data

-## Run ```example_example_convnd_bwd_data_xdl```
+This example demonstrates the backward data pass of an N-dimensional convolution, often denoted as `conv_bwd_data`. This operation is a crucial part of the backpropagation algorithm for training Convolutional Neural Networks (CNNs). Its purpose is to compute the gradient of the loss function with respect to the convolution's *input data*, which is then passed back to the preceding layer in the network.
+
+## Mathematical Formulation
+
+The backward data pass computes the gradient $\frac{\partial L}{\partial \text{In}}$, given the gradient from the subsequent layer, $\frac{\partial L}{\partial \text{Out}}$, and the filter weights `W` used in the forward pass.
+
+Let the forward convolution be defined as:
+$\text{Out} = \text{In} \star W$
+
+The backward data pass is mathematically equivalent to a "full" convolution between the output gradient tensor `dL/dOut` and the 180-degree rotated (or transposed and flipped) weight tensor `W`.
+
+$\frac{\partial L}{\partial \text{In}} = \frac{\partial L}{\partial \text{Out}} \star \text{rot180}(W)$
+
+This operation propagates the error signal from the output back to the input, weighted by the same filters that were used in the forward pass.
+
+## Algorithmic Strategy: Implicit GEMM
+
+As with the forward pass, the most efficient way to implement the backward data pass on a GPU is to transform the convolution into a General Matrix-Matrix Multiplication (GEMM) problem.
+
+1.  **Output Gradient Reshaping**: The output gradient tensor `dL/dOut` is logically reshaped into a matrix `dL/dOut'` of shape `[K, (N*Ho*Wo)]`. This becomes the "A" matrix in the GEMM.
+
+2.  **Weight Reshaping**: The weight tensor `W` is logically reshaped into a matrix `W'` of shape `[K, (C*Y*X)]`. This becomes the "B" matrix in the GEMM.
+
+3.  **Implicit GEMM**: The core computation is then formulated as a GEMM operation. However, the output of this GEMM is not a simple matrix; it's the `dL/dIn` tensor.
+    $(\text{dL/dIn})' = (W')^T \times (\text{dL/dOut})'$
+
+    The key insight is that this operation can be performed without explicitly forming the matrices. The GEMM kernel is designed to read from `dL/dOut` and `W` and write its results directly to the appropriate locations in the `dL/dIn` tensor. This process is sometimes referred to as an "implicit `col2im`" (column-to-image), as it is the inverse of the `im2col` transformation used in the forward pass.
+
+This "implicit GEMM" approach is highly efficient. It avoids the massive memory and bandwidth overhead of materializing intermediate matrices, which is critical for performance.
+
+## Source Code Organization
+
+-   [`conv_bwd_data_xdl.cpp`](./conv_bwd_data_xdl.cpp): The main example file that defines the parameters for a 2D convolution and instantiates the generic `DeviceConvNdBwdData` kernel to compute the input gradients.
+-   [`../../include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp`](../../include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp): The high-level device interface for the backward data convolution. It is templated on the dimensionality, layouts, and data types of the problem.
+-   [`../../include/ck/tensor_operation/gpu/grid/gridwise_gemm_implicit_gemm_v1r2_xdlops_nchw_kcyx_nkhw.hpp`](../../include/ck/tensor_operation/gpu/grid/gridwise_gemm_implicit_gemm_v1r2_xdlops_nchw_kcyx_nkhw.hpp): An example of a specific grid-wise kernel that implements the implicit GEMM algorithm for the backward data pass. The library contains multiple such kernels optimized for different layouts and problem types, and the `DeviceConvNdBwdData` interface selects the most appropriate one.
+-   [`../../library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp`](../../library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp): A CPU reference implementation used to verify the correctness of the GPU kernel's output.
+
+## Build and Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/17_convnd_bwd_data
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
@@ -45,3 +101,16 @@ Warm up
 Start running 1 times...
 Perf: 1.40031 ms, 69.8734 TFlops, 179.037 GB/s
 ```
+
+## Relationship to Other Passes
+
+The training of a single convolutional layer requires three distinct steps:
+
+1.  **Forward Pass (`conv_fwd`)**: Computes the output feature maps.
+    -   `Out = In * W`
+2.  **Backward Data Pass (`conv_bwd_data`)**: Computes the gradient with respect to the input, propagating the error to the previous layer. This is the focus of the current example.
+    -   `dL/dIn = dL/dOut * rot180(W)`
+3.  **Backward Weight Pass (`conv_bwd_weight`)**: Computes the gradient with respect to the weights, which is needed for the weight update.
+    -   `dL/dW = In * dL/dOut`
+
+All three passes are critical for training a CNN, and all are typically implemented as high-performance implicit GEMM operations.
--- a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
@@ -18,6 +18,10 @@
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 void print_helper_msg()
 {
    std::cout << "arg1: verification (0=no, 1=yes)\n"
--- a/example/18_batched_gemm_reduce/CMakeLists.txt
+++ b/example/18_batched_gemm_reduce/CMakeLists.txt
@@ -1,8 +1 @@
-list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp)
-   set(target 1)
- endif()
-endforeach()
+add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp)
--- a/example/18_batched_gemm_reduce/README.md
+++ b/example/18_batched_gemm_reduce/README.md
@@ -0,0 +1,78 @@
+# Batched GEMM with Reduction
+
+This example demonstrates a Batched General Matrix-Matrix Multiplication (Batched GEMM) where the result of each individual GEMM in the batch is then reduced along one of its dimensions. This is a specialized fusion pattern that combines a compute-intensive operation (GEMM) with a memory-intensive one (reduction), offering significant performance benefits for specific workloads.
+
+## Mathematical Formulation
+
+The operation performs a standard GEMM for each item in a batch, and then reduces the resulting matrix to a vector. For each batch item `b` from `0` to `BatchCount-1`:
+
+1.  **GEMM Stage**: A standard matrix multiplication is performed.
+    $C_{[b]} = A_{[b]} \times B_{[b]}$
+
+2.  **Reduction Stage**: The resulting matrix $C_{[b]}$ is reduced along one of its dimensions (e.g., the M dimension) to produce an output vector $D_{[b]}$.
+    $D_{[b], j} = \bigoplus_{i=0}^{M-1} C_{[b], i, j}$
+
+Where:
+-   $A_{[b]}$ is an $M \times K$ matrix.
+-   $B_{[b]}$ is a $K \times N$ matrix.
+-   $C_{[b]}$ is the intermediate $M \times N$ result matrix for batch `b`.
+-   $D_{[b]}$ is the final $1 \times N$ output vector for batch `b`.
+-   $\bigoplus$ is a binary, associative reduction operator like sum, max, or min.
+
+The key optimization is that the intermediate matrix $C_{[b]}$ is never written to global memory. The reduction is fused directly into the GEMM kernel.
+
+## Algorithmic Strategy: Fused GEMM and Reduction
+
+The implementation fuses the reduction into the epilogue of a batched GEMM kernel. The batch dimension provides a natural axis for parallelism.
+
+1.  **Batch Scheduling**: The `BatchCount` GEMM problems are distributed across the GPU's thread blocks. Each block is assigned one or more GEMMs from the batch to compute.
+
+2.  **Tiled GEMM Core**: For each assigned GEMM, the thread block runs a standard tiled GEMM algorithm to compute the product $A_{[b]} \times B_{[b]}$. The result for each tile of $C_{[b]}$ is accumulated in the private registers of the threads.
+
+3.  **Fused Reduction Epilogue**: This is where the fusion occurs. Instead of writing the computed tile of $C_{[b]}$ to global memory, the threads use it as input for a parallel reduction.
+    -   **Intra-Block Reduction**: The threads within a block, which collectively hold the values for a tile of $C_{[b]}$, perform a local reduction. For example, to reduce along the M dimension, threads responsible for different M-rows but the same N-column will cooperate, using fast shared memory to sum their partial results.
+    -   **Inter-Block Reduction**: Since multiple thread blocks may be working on different M-tiles for the same batch item, their partial reduction results must be combined. Each block writes its partial sum to a designated location in the output vector `D`, using atomic operations (like `atomicAdd`) to safely accumulate the final result.
+
+This strategy completely eliminates the global memory traffic associated with the intermediate matrix `C`, which is often the largest tensor in the operation. This leads to substantial savings in memory bandwidth and improved performance.
+
+## Source Code Organization
+
+-   [`batched_gemm_reduce_xdl.cpp`](./batched_gemm_reduce_xdl.cpp): The main example file. It sets up the batched GEMM problem and instantiates the `DeviceBatchedGemmReduce` operation, specifying the reduction dimension and operator.
+-   [`../../include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce.hpp`](../../include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce.hpp): The high-level device interface for this fused operation.
+-   [`../../include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_reduce_xdl_cshuffle.hpp`](../../include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_reduce_xdl_cshuffle.hpp): The grid-wise kernel that implements the fused logic. It handles the batch scheduling, the tiled GEMM, and the fused reduction epilogue with atomic operations for inter-block communication.
+
+## Build and Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/18_batched_gemm_reduce
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+```bash
+# Run the example with default settings
+./batched_gemm_reduce_xdl
+
+# Run with verification, data initialization, and timing
+./batched_gemm_reduce_xdl 1 2 1
+```
+
+## Applications
+
+This fused pattern is less common than simple GEMM+Bias but is highly effective for specific algorithms.
+
+-   **Gradient Computations**: In some complex neural network layers, the gradient calculation might involve a matrix product followed by a summation. For example, computing the gradient with respect to a bias term often involves summing the output gradients over the batch and spatial dimensions. If the output gradient itself is the result of a GEMM, this fused kernel could be applicable.
+-   **Custom Attention Mechanisms**: While standard attention involves a `softmax`, some research explores attention-like mechanisms that might use a simple sum or max reduction instead. If the query-key interaction is formulated as a batched GEMM, this kernel could compute the attention weights in a single, fused step.
+-   **Scientific Computing**: Certain numerical methods, particularly in physics or signal processing, may involve performing a linear transform (GEMM) on a set of signals (a batch) and then integrating the result (a reduction).
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -19,14 +19,19 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

 using F16 = ck::half_t;
 using F32 = float;

-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;

 using ADataType         = F16;
 using BDataType         = F16;
@@ -64,7 +69,7 @@ using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatc
 //######|        |        |        | Type|  Type|  Type| DataType| DataType|  DataType|    Type Tuple| Elementwise| Elementwise| Elementwise|      Reduce|               |                |   MemoryData|     Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths|     SrcDstScalarPerVector|        SrcDstScalarPerVector|
 //######|        |        |        |     |      |      |         |         |          |              |   Operation|   Operation|   Operation|   Operation|               |                |    Operation|                   |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|                _NPerBlock|                   _MPerBlock|
 //######|        |        |        |     |      |      |         |         |          |              |            |            |            |            |               |                |             |                   |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                          |                             |
-        <     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   ReducePtrsGlobal,  AElementOp,  BElementOp,  CElementOp, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceGlobalMemOps, GemmSpecialization,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>;
+        <     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   ReducePtrsGlobal,  AElementOp,  BElementOp,  CElementOp, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceGlobalMemOps, GemmSpecialization,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4,             S<32, 8>,                         4,                            1>;
 // clang-format on

 using ReferenceBatchedGemmInstance =
@@ -137,11 +142,13 @@ int main(int argc, char* argv[])

        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
-            return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {row * stride, stride, 1_uz}, Bypass{});
        }
        else
        {
-            return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride});
+            return HostTensorDescriptor(
+                {batch_count, row, col}, {col * stride, 1_uz, stride}, Bypass{});
        }
    };

--- a/example/19_binary_elementwise/README.md
+++ b/example/19_binary_elementwise/README.md
@@ -0,0 +1,84 @@
+# Binary Elementwise Operations with Broadcasting
+
+This example demonstrates a generic binary elementwise operation, a fundamental building block in numerical computing. It covers two important cases:
+1.  **Simple Elementwise**: Applying a binary function to two input tensors of the *same* shape.
+2.  **Elementwise with Broadcasting**: Applying a binary function to two input tensors of *different but compatible* shapes.
+
+Broadcasting defines a set of rules for applying elementwise operations on tensors of different sizes, and it is a cornerstone of libraries like NumPy and TensorFlow.
+
+## Mathematical Formulation
+
+### Simple Elementwise
+Given two input tensors, A and B, of the same rank and dimensions, and a binary operator $\odot$, the operation computes an output tensor C where each element is:
+
+$C_{i,j,k,\dots} = A_{i,j,k,\dots} \odot B_{i,j,k,\dots}$
+
+### Elementwise with Broadcasting
+Broadcasting allows elementwise operations on tensors with different shapes, provided they are compatible. Two dimensions are compatible if they are equal, or if one of them is 1. The operation implicitly "stretches" or "duplicates" the tensor with the dimension of size 1 to match the other tensor's shape.
+
+For example, adding a bias vector `B` of shape `(1, N)` to a matrix `A` of shape `(M, N)`:
+$C_{i,j} = A_{i,j} + B_{0,j}$
+
+Here, the single row of `B` is broadcast across all `M` rows of `A`. The output tensor `C` has the shape `(M, N)`.
+
+Common binary elementwise operations include addition, subtraction, multiplication (Hadamard product), division, max, and min.
+
+## Algorithmic Strategy: Grid-Stride Loop with Broadcasting
+
+The implementation for both cases relies on the efficient **grid-stride loop**, which is adapted to handle broadcasting.
+
+1.  **Grid Partitioning**: The problem is mapped to a 1D grid of threads based on the number of elements in the **output** tensor.
+
+2.  **Grid-Stride Loop**: Each thread iterates through a subset of the output elements. For each output index, it must calculate the corresponding indices into the input tensors A and B.
+
+3.  **Broadcasting Logic**:
+-   The core of the broadcasting logic lies in the `get_broadcast_coord` function. If an input tensor's dimension is 1, the coordinate for that dimension is always set to 0, effectively reusing the same element across the broadcast dimension. If the dimension matches the output, the coordinate is passed through.
+-   This strategy ensures that memory accesses to the larger tensor remain coalesced, while accesses to the smaller, broadcasted tensor will naturally involve re-reading the same values, which is efficiently handled by the GPU's cache hierarchy.
+
+Like the simple case, broadcasted elementwise operations are almost always memory-bandwidth-bound.
+
+## Source Code Organization
+
+This example contains multiple files to demonstrate different scenarios:
+
+-   [`binary_elementwise_xdl.cpp`](./binary_elementwise_xdl.cpp): Demonstrates the simple case where both input tensors have the same shape.
+-   [`broadcast_add_2d_amn_bn.cpp`](./broadcast_add_2d_amn_bn.cpp): A specific example of broadcasting, adding a tensor of shape `(B, N)` to a tensor of shape `(A, M, N)`.
+-   [`../../include/ck/tensor_operation/gpu/device/device_elementwise.hpp`](../../include/ck/tensor_operation/gpu/device/device_elementwise.hpp): The high-level device interface. It is generic enough to handle both simple and broadcasted operations by correctly interpreting the tensor descriptors, which contain shape and stride information.
+-   [`../../include/ck/tensor_operation/gpu/grid/gridwise_elementwise.hpp`](../../include/ck/tensor_operation/gpu/grid/gridwise_elementwise.hpp): The grid-wise kernel that implements the grid-stride loop. The tensor coordinate logic within this kernel correctly handles broadcasting based on the provided tensor descriptors.
+-   [`../../include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp`](../../include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp): Defines the various binary operator functors (like `Add`, `Multiply`, etc.).
+
+## Build and Run
+
+### Prerequisites
+
+Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
+
+### Build the Example
+```bash
+cd /path/to/composable_kernel/example/19_binary_elementwise
+mkdir build && cd build
+
+cmake \
+  -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+  -DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
+  ..
+
+make -j
+```
+
+### Run the Example
+```bash
+# Run the simple elementwise example
+./binary_elementwise_xdl 1 2 1
+
+# Run the broadcasting example
+./broadcast_add_2d_amn_bn 1 2 1
+```
+
+## Applications
+
+Broadcasting is a powerful feature that makes code more concise and memory-efficient.
+-   **Adding Bias**: The most common use case in deep learning is adding a bias vector (shape `[N]`) to a matrix of activations (shape `[Batch, N]`).
+-   **Feature Scaling**: Multiplying a feature map (shape `[N, C, H, W]`) by a per-channel scaling factor (shape `[1, C, 1, 1]`).
+-   **Standardization**: In data preprocessing, subtracting the mean (a vector) and dividing by the standard deviation (another vector) from a data matrix.
+-   **Coordinate Grids**: Creating coordinate grids by adding a row vector `[0, 1, 2...]` to a column vector `[0, 1, 2...]^T`.
--- a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
+++ b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
@@ -14,6 +14,10 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using F16 = ck::half_t;
 using F32 = float;

--- a/Show More
+++ b/Show More