Merge branch 'develop' into hstu_attention_mi350_fwd_bwd

This commit is contained in:
Qianfeng Zhang
2025-11-23 04:20:53 +00:00
3118 changed files with 208508 additions and 42460 deletions

View File

@@ -44,8 +44,7 @@ list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllv
example_compile_options(example_gemm_xdl_fp8_v3 PRIVATE ${GEMM_OPTIONS})
example_compile_options(example_gemm_xdl_bf16_v3 PRIVATE ${GEMM_OPTIONS})
list(APPEND gpu_list gfx942 gfx950)
list(APPEND gpu_list gfx942 gfx950 gfx1200 gfx1201 gfx12-generic)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
@@ -89,7 +88,14 @@ foreach(gpu IN LISTS GPU_TARGETS)
add_example_executable(example_gemm_xdl_lds_direct_load_fp16 gemm_xdl_lds_direct_load_fp16.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_lds_direct_load_fp16)
set(target 1)
endif()
endforeach()
list(APPEND gpu_list gfx90a gfx942 gfx950 gfx1200 gfx1201 gfx12-generic)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_example_executable(example_gemm_xdl_bf16_streamk_v3 gemm_xdl_bf16_streamk_v3.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_streamk_v3)
@@ -99,6 +105,16 @@ foreach(gpu IN LISTS GPU_TARGETS)
endif()
endforeach()
# Register example_gemm_xdl_lds_direct_load_fp32_tf32 at most once: the first
# entry of GPU_TARGETS that also appears in gpu_list_tf32 adds the executable
# and hooks it into the example_gemm_xdl group; `target` then flips to 1 so
# later matching entries do not register it again.
list(APPEND gpu_list_tf32 gfx942 gfx950)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
    if(target EQUAL 0 AND gpu IN_LIST gpu_list_tf32)
        add_example_executable(example_gemm_xdl_lds_direct_load_fp32_tf32 gemm_xdl_lds_direct_load_fp32_tf32.cpp)
        add_example_dependencies(example_gemm_xdl example_gemm_xdl_lds_direct_load_fp32_tf32)
        set(target 1)
    endif()
endforeach()
add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)

View File

@@ -1,27 +1,221 @@
# Instructions for ```example_gemm_xdl```
[Back to supported operations](../../../include/ck/README.md)
# Composable Kernel GEMM Example
## Introduction
GEMM (General Matrix Multiplication) is a fundamental operation in linear algebra and deep learning. It computes the product of two matrices, optionally adds a bias or residual, and is the core of many neural network layers (MLPs, attention, convolutions via im2col). This example demonstrates the flexible and high-performance GEMM API provided by Composable Kernel.
---
## Theory
**Mathematical Formulation:**
$$
C = \alpha (A \times B) + \beta D
$$
- $A$: [M, K] input matrix
- $B$: [K, N] weight matrix
- $D$: [M, N] optional bias/residual
- $C$: [M, N] output
- $\alpha, \beta$: scalars (often 1.0, 0.0)
GEMM is implemented using a tiled/blocking strategy to maximize data reuse and memory bandwidth. Modern GPU implementations use matrix core/XDL/MFMA instructions for high throughput. The operation is the computational backbone for transformer attention, MLPs, CNNs (via lowering), and more.
---
## CK GEMM API Overview
CK provides a highly composable GEMM API via the `DeviceGemm` family of device operations. These are highly templated to support a wide range of data types, layouts, and fused operations.
### Template Parameters
- **ALayout** - A matrix layout (RowMajor/ColumnMajor)
- **BLayout** - B matrix layout (RowMajor/ColumnMajor)
- **CLayout** - C matrix layout (RowMajor/ColumnMajor)
- **ADataType** - A matrix data type
- **BDataType** - B matrix data type
- **CDataType** - C matrix data type
- **AElementwiseOperation** - Fused operation on tensor A before GEMM
- **BElementwiseOperation** - Fused operation on tensor B before GEMM
- **CElementwiseOperation** - Fused operation on tensor C after GEMM
For large K dimension, use `DeviceGemmSplitK` to split K across workgroups (requires zeroing output buffer due to use of AtomicAdd).
For fused operations with additional tensors, use `DeviceGemmMultipleABD` or `DeviceGemmMultipleD`:
- **DsLayout** - layouts for additional tensors
- **DsDataType** - data types for additional tensors
For `DeviceGemmMultipleABD`, pass **ALayout**, **BLayout**, **ADataType**, **BDataType** as tuples.
---
## Supported GEMM Variants
- **DeviceGemm**: Standard GEMM
- **DeviceGemmSplitK**: Split-K GEMM for large K
- **DeviceGemmMultipleABD**: Fused GEMM with multiple A/B/D tensors
- **DeviceGemmMultipleD**: Fused GEMM with multiple D tensors
---
## Supported Device Operations
- **DeviceGemmDl**: DL instructions
- **DeviceGemmDpp**: DL instructions with DPP during data load
- **DeviceGemmWmma_CShuffle**: WMMA instructions with CShuffle optimization
- **DeviceGemm_Xdl_CShuffle_LdsDirectLoad**: XDL instructions, CShuffle, direct global-to-shared load
- **DeviceGemm_Xdl_CShuffle**: XDL instructions with CShuffle
- **DeviceGemm_Xdl_CShuffleV2**: XDL instructions, optimized pipeline vs. V1
- **DeviceGemmXdlSkipBLds**: XDL, skips shared memory load for B
- **DeviceGemm_Xdl_WaveletModel_CShuffle**: XDL, CShuffle, wavelet producer/consumer
- **DeviceGemmXdl**: XDL instructions
---
## Supported Data Types and Layouts
### XDL Instruction
| |Is supported|
|-------|---|
|bf16 |✔️|
|fp16 |✔️|
|fp32 |✔️|
|int8 |✔️|
|fp8 |✔️|
### WMMA Instruction
| |Is supported|
|-------|---|
|bf16 |✔️|
|fp16 |✔️|
|fp32 |❌|
|int8 |✔️|
|fp8 |❌|
### DL Instruction
| |Is supported|
|-------|---|
|bf16 |❌|
|fp16 |✔️|
|fp32 |✔️|
|int8 |✔️|
|fp8 |❌|
---
## Supported Fused Elementwise Operations
- **B Matrix Multiply + Add + Gelu** - bf16 (int8 for B matrix)
- **B Matrix Multiply + Add** - bf16 (int8 for B matrix)
- **B Matrix Multiply + Gelu** - bf16 (int8 for B matrix)
- **B Matrix Multiply** - bf16 (int8 for B matrix)
- **Add + Add + Gelu** - fp16
- **Add + Gelu** - fp16, bf16 (int8 for B matrix) for Row/Column/Row
- **Multiply** - fp16
- **Add + Multiply** - fp16
- **Add + Relu** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
- **Add + Silu** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
- **Add** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
- **Bilinear** - fp16, int8
- **Gelu** - fp16
- **Multiply + Add** - fp16 for Row/Column/Row and Row/Row/Row, fp16 (int8 for B matrix, fp32 for Bias) for Row/Column/Row and Row/Row/Row
- **Quantization** - int8
---
## GEMM V2 (Universal GEMM)
Optimized for the MI300 series. The operation is exposed as `DeviceGemmV2` and takes template parameters similar to those listed above.
- **ALayout**, **BLayout**, **CLayout**
- **ADataType**, **BDataType**, **CDataType**
- **AElementwiseOperation**, **BElementwiseOperation**, **CElementwiseOperation**
Split-K is supported (requires zeroing output buffer if splitK > 1).
### Device Operations
- **DeviceGemm_Xdl_CShuffleV3**: XDL with CShuffle optimization
- **DeviceGemm_Xdl_CShuffleV3R1**: XDL with CShuffle, reduction on split-K after GEMM
### Supported Types
| |Is supported|
|-------|---|
|bf16 |✔️|
|fp16 |✔️|
|fp32 |❌|
|int8 |❌|
|fp8 (C bf16)|✔️|
|fp16 (A fp8)|✔️|
|fp16 (B fp8)|✔️|
---
## Other GEMM Extensions
- **DeviceGemm_dequantB**: GEMM with dequantization (WMMA)
- **DeviceGemmMultipleD_ABScale**: GEMM with scale for A and B
- **DeviceGemmMultipleDLayernorm**: GEMM fused with layernorm
- **DeviceGemmMultipleDMultipleR**: GEMM fused with reductions and custom global reductions
- **DeviceGemmReduce**: GEMM fused with reduction
- **DeviceGemm_Streamk_V2**: Stream K with reduction instead of AtomicAdd
- **DeviceGemmStreamK**: Stream K using AtomicAdd
---
## How to Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build and run
## Run ```example_gemm_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
./bin/example_gemm_xdl 0 1 5
cd composable_kernel/example/01_gemm
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
make -j
# Example run (FP16)
./gemm_xdl_fp16 -M 4096 -N 4096 -K 4096 -v 1 -t 1
```
# Instructions for ```example_gemm_xdl_fp16_streamk_v3```
---
## Source Code Structure
## Run ```example_gemm_xdl_fp16_streamk_v3```
```bash
arg1: verification (0=no, 1=yes)
arg2: initialization (0=no init, 1=integer value, 2=decimal value)
arg3: time kernel (0=no, 1=yes)
arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC
arg10: stream-k select (-1: default config, 0: all DP, 1: 1-tile SK, 2: 2-tile SK)
arg11: Grid_size(-1 for max occupancy)
bin/example_gemm_xdl_fp16_streamk_v3 1 2 1 3840 4096 4096 4096 4096 4096 1 -1
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
problem {M:3840, N:4096, K:4096, SA:4096, SB:4096, SC:4096, MP:4032, NP:4096, KRead:4096, KP:4096, AK0:512, BK0:2048, MBlock: 18, NBlock: 16, Stream-K Selection:1, Grid size:-1}
Perf: 0.292022 ms, 441.23 TFlops, 330.348 GB/s, DeviceGemmXdlUniversal<MNPadding, RRR> BlkSize: 256, BlkTile: 224x256x64, WaveTile: 16x16, WaveMap: 7x8, VmemReadVec: 8x8, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2
```
example/01_gemm/
├── gemm_xdl_fp16.cpp # Main example: sets up, runs, and verifies GEMM (FP16)
├── gemm_xdl_fp32.cpp # Main example: FP32 variant
include/ck/tensor_operation/gpu/device/
│ └── device_gemm.hpp # Device-level GEMM API (templated)
include/ck/tensor_operation/gpu/device/impl/
│ └── device_gemm_xdl.hpp # XDL-based GEMM implementation
include/ck/tensor_operation/gpu/grid/
│ └── gridwise_gemm_xdl.hpp # Grid-level tiled GEMM kernel
include/ck/tensor_operation/gpu/block/
│ └── blockwise_gemm_xdl.hpp # Block-level tiled GEMM
library/reference_tensor_operation/cpu/
└── reference_gemm.hpp # CPU reference GEMM for correctness checking
```
### Key Classes and Functions
- **DeviceGemmXdl** (in `device_gemm_xdl.hpp`):
Main device API for launching GEMM kernels.
- **GridwiseGemmXdl** (in `gridwise_gemm_xdl.hpp`):
Implements the tiled/blocking GEMM kernel for the GPU grid.
- **BlockwiseGemmXdl** (in `blockwise_gemm_xdl.hpp`):
Handles block-level computation and shared memory tiling.
- **reference_gemm** (in `reference_gemm.hpp`):
CPU implementation for result verification.
---
This example is the foundation for all matrix operations in Composable Kernel and is the basis for more advanced fused and batched operations.

View File

@@ -25,6 +25,11 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
#include "ck/host_utility/kernel_launch.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
struct ProblemSize final
{
@@ -310,10 +315,14 @@ bool parse_cmd_args<ProblemSizeSplitK>(int argc,
return true;
}
template <typename DataType>
template <typename DataType, typename ComputeDataType = DataType>
inline __host__ __device__ constexpr double get_rtol()
{
if constexpr(std::is_same_v<DataType, float>)
if constexpr(std::is_same_v<DataType, float> && std::is_same_v<ComputeDataType, ck::tf32_t>)
{
return 1e-3;
}
else if constexpr(std::is_same_v<DataType, float>)
{
return 1e-3;
}
@@ -351,10 +360,14 @@ inline __host__ __device__ constexpr double get_rtol()
}
}
template <typename DataType>
template <typename DataType, typename ComputeDataType = DataType>
inline __host__ __device__ constexpr double get_atol()
{
if constexpr(std::is_same_v<DataType, float>)
if constexpr(std::is_same_v<DataType, float> && std::is_same_v<ComputeDataType, ck::tf32_t>)
{
return 1e-3;
}
else if constexpr(std::is_same_v<DataType, float>)
{
return 1e-3;
}

View File

@@ -26,17 +26,18 @@ using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuf
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
PassThrough, PassThrough, PassThrough, GemmDefault,
128,
128, 64,
64, 8, 8,
256,
128, 256, 64,
8, 8,
16, 16,
4, 2,
S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
2, 8,
S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 1, 8, 1,
S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 1, 8, 1,
1, 1, S<1, 32, 1, 4>, 8,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3>;
1, 1,
S<1, 64, 1, 4>, 8,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::

View File

@@ -13,7 +13,7 @@ using CDataType = ck::bhalf_t;
using ComputeTypeA = ck::f8_t;
using ComputeTypeB = ck::f8_t;
using ALayout = Row;
using ALayout = Col;
using BLayout = Col;
using CLayout = Row;
@@ -30,13 +30,13 @@ using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuf
PassThrough, PassThrough, PassThrough, GemmDefault,
128,
128, 64, 64,
8, 8,
16, 16, // AK1, BK1
16, 16,
4, 2,
S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 4, 16, 0,
S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
2, 16, 16, 0,
1, 1, S<1, 32, 1, 4>, 8,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1,
ComputeTypeA, ComputeTypeB>;

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -27,7 +27,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -199,9 +199,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
return true;
}
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
ck::is_gfx11_supported() || ck::is_gfx12_supported()))
{
std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;
return true;
}

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -37,7 +37,7 @@ using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffl
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 2, S<1, 16, 1, 16>, 8, ck::LoopScheduler::Interwave, ck::PipelineVersion::v1>;
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 2, S<1, 16, 1, 16>, 4, ck::LoopScheduler::Interwave, ck::PipelineVersion::v1>;
// clang-format on
using DeviceGemmInstance = DeviceGemmInstance1;

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -30,7 +30,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Scheduler| Version| |
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopSched, PipelineVer, ComputeType>;
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4, LoopSched, PipelineVer, ComputeType>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -249,9 +249,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
return true;
}
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
ck::is_gfx11_supported() || ck::is_gfx12_supported()))
{
std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;
return true;
}

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -38,14 +38,14 @@ using DeviceGemmV2Instance =
AElementOp, BElementOp, CElementOp, GemmDefault,
256, Scale_Block_N, Scale_Block_K,
128, 128,
KPerBlock, 8, 32,
32, 32,
4, 1,
KPerBlock, 8, 16,
16, 16,
8, 2,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 32, 32, 0,
1, 1, S<1, 32, 1, 8>, 8,
2, 16, 16, 0,
1, 1, S<1, 16, 1, 16>, 4,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, CDataType, CDataType, PermuteA, PermuteB>;
// clang-format on
@@ -281,9 +281,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
return true;
}
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
ck::is_gfx11_supported() || ck::is_gfx12_supported()))
{
std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
std::cout << "This kernel support gfx942, gfx950, gfx11 and gfx12 only" << std::endl;
return true;
}

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -33,13 +33,13 @@ using DeviceGemmInstance =
2, 256,
256, 256,
32, 8, 4,
32, 32,
4, 4,
16, 16,
8, 8,
S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 8, 4, 0,
1, 1, S<1, 32, 1, 8>, 8,
1, 1, S<1, 32, 1, 8>, 4,
ck::LoopScheduler::Default, ck::PipelineVersion::v1>;
// clang-format on

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -31,7 +31,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Scheduler| Version| TypeA| TypeB|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4, LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
// this instance has been tested and confirmed working on gfx950
// < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 128, 32, 32, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
// clang-format on
@@ -55,4 +55,12 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
#include "run_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
// Program entry point.
// When ck::is_gfx11_supported() reports true the example is not run and the
// process exits with status 0. Otherwise the exit status is the logical
// negation of run_gemm_example()'s result.
int main(int argc, char* argv[])
{
    if(!ck::is_gfx11_supported())
    {
        return !run_gemm_example(argc, argv);
    }

    // gfx11 path: nothing is executed, report success.
    return 0;
}

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -31,7 +31,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Scheduler| Version| TypeA| TypeB|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4, LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
@@ -57,4 +57,12 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
#include "run_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
// Program entry point.
// When ck::is_gfx11_supported() reports true the example is not run and the
// process exits with status 0. Otherwise the exit status is the logical
// negation of run_gemm_example()'s result.
int main(int argc, char* argv[])
{
    if(!ck::is_gfx11_supported())
    {
        return !run_gemm_example(argc, argv);
    }

    // gfx11 path: nothing is executed, report success.
    return 0;
}

View File

@@ -28,7 +28,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
static constexpr bool PermuteA = false;
static constexpr bool PermuteB = false;
static constexpr int KPack = 32; // int4 -> 32, fp8 -> 16, fp16 -> 8
// clang-format off
#if 0
using DeviceGemmV2Instance =
@@ -56,14 +56,14 @@ using DeviceGemmV2Instance =
AElementOp, BElementOp, CElementOp, GemmDefault,
256,
256, 256,
128, 16, 32,
32, 32,
4, 4,
128, 16, KPack,
16, 16,
8, 8,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 16, 16, 0,
S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 32, 32, 0,
1, 1, S<1, 32, 1, 8>, 8,
1, 1, S<1, 32, 1, 8>, 4,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, F8, F8, PermuteA, PermuteB>;
#endif
@@ -160,7 +160,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
auto gemm = DeviceGemmV2Instance{};
// weight pre-shuffle
int KPack = 32; // int4 -> 32, fp8 -> 16, fp16 -> 8
int NLane = gemm.GetPreShuffleParameters();
int KLane = 64 / NLane;
@@ -269,9 +268,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
return true;
}
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
ck::is_gfx12_supported()))
{
std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
std::cout << "This kernel support gfx942, gfx950 and gfx12 only" << std::endl;
return true;
}

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -38,14 +38,14 @@ using DeviceGemmV2Instance =
AElementOp, BElementOp, CElementOp, GemmDefault,
256,
128, 128,
KPerBlock, 16, 32,
32, 32,
2, 2,
KPerBlock, 16, 16,
16, 16,
4, 4,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 16, 16, 0,
S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 32, 32, 0,
1, 1, S<1, 32, 1, 8>, 8,
2, 16, 16, 0,
1, 1, S<1, 32, 1, 8>, 4,
ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v2, ADataType, ADataType, PermuteA, PermuteB>;
// clang-format on
@@ -247,9 +247,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
return true;
}
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950" ||
ck::is_gfx12_supported()))
{
std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
std::cout << "This kernel support gfx942, gfx950 and gfx12 only" << std::endl;
return true;
}

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -36,7 +36,7 @@ using DeviceGemmV2Instance =
2, 16, 16, 0,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 16, 16, 0,
1, 2, S<1, 32, 1, 8>, 8,
1, 2, S<1, 32, 1, 8>, 4,
ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3, ck::f8_t>;
// clang-format on

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -27,7 +27,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 16>;
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::

View File

@@ -0,0 +1,85 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "common.hpp"
// Build-time switch: 1 pulls in the LDS-direct-load device GEMM header and
// instance, 0 falls back to the plain XDL CShuffle device GEMM.
#define USING_DIRECT_LOADS 1
#if USING_DIRECT_LOADS
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_lds_direct_load.hpp"
#else
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
#endif
// Signals that this file declares its own ComputeDataType alias below.
// NOTE(review): presumably checked by run_gemm_example.inc, which otherwise
// aliases ComputeDataType to AccDataType - confirm against that file.
#define EXAMPLE_WITH_COMPUTE_DATATYPE
// Every tensor, accumulator and shuffle type is float in this example.
using F32 = float;
using ADataType = F32;
using BDataType = F32;
using AccDataType = F32;
using CShuffleDataType = F32;
using CDataType = F32;
// Compute type handed to the device instance and the host reference GEMM.
using ComputeDataType = ck::tf32_t;
// Row, Col and PassThrough are not declared in this file.
// NOTE(review): presumably provided by common.hpp - confirm.
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
// No elementwise transformation is applied to A, B or C.
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// Device GEMM instance exercised by this example. USING_DIRECT_LOADS selects
// between DeviceGemm_Xdl_CShuffle_LdsDirectLoad (which receives ComputeDataType
// as its last template argument) and the plain DeviceGemm_Xdl_CShuffle.
// The "// ######" rows below are column headers for the positional template
// arguments; in the direct-load branch they are wrapped to follow the three
// argument lines.
#if USING_DIRECT_LOADS
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_LdsDirectLoad
// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer|
// ######| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockLds|
// ######| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler | pipeline ver | gemm type |
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block|
// ######| XDL| XDL| Per| Per| ThreadCluster| SrcAccessOrder| SrcVectorDim| Scalar| AddExtraM| ThreadCluster| SrcAccessOrder| SrcVectorDim| Scalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| | | PerVector| | Lengths_K0_N_K1| | | PerVector| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 128, 128, 32,
8, 8, 32, 32, 2, 2, S<4, 8, 8>, S<1, 0, 2>, 2, 1, 1, S<4, 8, 8>, S<1, 0, 2>, 2, 1, 1,
1, 1, S<1, 8, 1, 8>, 4, ck::LoopScheduler::Default, ck::PipelineVersion::v4, ComputeDataType>;
// clang-format on
#else
// NOTE(review): this fallback instance is not given ComputeDataType, so it
// presumably does not compute in TF32 - confirm before setting
// USING_DIRECT_LOADS to 0.
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 8, 1, 8>, 4>;
// clang-format on
#endif
// Reference GEMM from the ck::tensor_operation::host namespace.
// ComputeDataType is passed as the last two template arguments.
// NOTE(review): presumably these are the compute types for A and B, so the
// reference rounds inputs the same way the device kernel does - confirm
// against the ReferenceGemm declaration.
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
AccDataType,
AElementOp,
BElementOp,
CElementOp,
ComputeDataType,
ComputeDataType>;
// Reference GEMM from the ck::tensor_operation::device namespace. Unlike the
// host variant above it takes the three layouts and is not given
// ComputeDataType.
using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
CDataType,
AccDataType,
AElementOp,
BElementOp,
CElementOp>;
// Textually included after all aliases above have been declared.
// NOTE(review): presumably defines run_gemm_example() in terms of those
// aliases - confirm against the .inc file.
#include "run_gemm_example.inc"
// Entry point: forwards the command line to run_gemm_example() and negates its
// result, so a truthy (successful) result becomes process exit status 0.
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
// Drop the opt-in macro once the shared example code has been included.
#undef EXAMPLE_WITH_COMPUTE_DATATYPE

View File

@@ -36,7 +36,7 @@ using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
#else
< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 4, 4, 7, 1>;
< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 128, 4, 4, 16, 16, 1, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 4, 4, 7, 1>;
using ADataType = float;
using BDataType = float;
using CDataType = float;
@@ -185,7 +185,6 @@ int main(int argc, char* argv[])
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
@@ -209,8 +208,7 @@ int main(int argc, char* argv[])
return 0;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -29,7 +29,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_WaveletM
// ######| | | | Type| Type| Type| DataType| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| ThreadGroupSize| ThreadGroupSize| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, F16, CDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1,8>, 8>;
< ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, F16, CDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1,8>, 4>;
// clang-format on
using DeviceGemmInstance = DeviceGemmInstance;

View File

@@ -2,7 +2,11 @@
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/library/utility/validation_common.hpp"
// use macro to minimize code change
#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
using ComputeDataType = AccDataType;
#endif
template <typename ProblemType>
bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
@@ -24,11 +28,11 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
}
};
@@ -54,17 +58,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
try
{
ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
M, N, K, StrideA, StrideB, StrideC);
}
catch(const std::runtime_error& e)
{
std::cerr << "Error: " << e.what() << std::endl;
return false;
}
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
@@ -218,8 +211,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
pass &= ck::utils::check_err(c_m_n_device_result,
c_m_n_host_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
get_rtol<CDataType, ComputeDataType>(),
get_atol<CDataType, ComputeDataType>());
#endif
}
@@ -249,8 +242,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
pass &= ck::utils::check_err(c_m_n_device_result,
c_m_n_device_ref_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
get_rtol<CDataType, ComputeDataType>(),
get_atol<CDataType, ComputeDataType>());
}
return pass == true;

View File

@@ -1,6 +1,78 @@
# Instructions for ```example_gemm_bilinear_xdl_fp16```
# Composable Kernel GEMM Bilinear Example
## Introduction
This example demonstrates GEMM (General Matrix Multiplication) fused with bilinear operations on auxiliary tensors using Composable Kernel. Bilinear fusion patterns are widely used in neural networks for gating, attention, and multimodal feature fusion, where the output of a matrix multiplication is combined elementwise with one or more additional tensors.
---
## Theory
**Mathematical Formulation:**
$$
F = \text{BilinearOp}(A \times B, D, E)
$$
- $A$: [M, K] input matrix
- $B$: [K, N] weight matrix
- $D$, $E$: [M, N] auxiliary tensors (or broadcastable)
- $F$: [M, N] output
**Examples:**
- Elementwise: $F = (A \times B) \odot D \odot E$
- Gated: $F = (A \times B) \odot \sigma(D) + E$
- Weighted: $F = \alpha (A \times B) + \beta (D \odot E)$
The GEMM result is kept in registers and combined with auxiliary tensors in the epilogue, avoiding intermediate writes to global memory. This pattern is common in attention, gating, and feature interaction layers.
---
## CK GEMM Bilinear API Overview
CK provides a composable API for GEMM with multiple auxiliary tensors via the `DeviceGemmMultipleD` operation.
### Template Parameters
- **ALayout** - A matrix layout (RowMajor/ColumnMajor)
- **BLayout** - B matrix layout (RowMajor/ColumnMajor)
- **DsLayout** - Layouts for auxiliary tensors (tuple)
- **ELayout** - Output matrix layout (RowMajor/ColumnMajor)
- **ADataType** - A matrix data type
- **BDataType** - B matrix data type
- **DsDataType** - Data types for auxiliary tensors (tuple)
- **EDataType** - Output matrix data type
- **AElementwiseOperation** - Fused operation on tensor A before GEMM
- **BElementwiseOperation** - Fused operation on tensor B before GEMM
- **CDEElementwiseOperation** - Fused operation on C, D, E after GEMM
### Supported Data Types and Layouts
- Supports fp16, int8, and other types depending on the device operation.
- Supports RowMajor and ColumnMajor layouts for all tensors.
### Supported Device Operations
- **DeviceGemmMultipleD**: Standard multi-tensor GEMM
- **DeviceGemmMultipleD_Bilinear**: GEMM with bilinear fusion in the epilogue
---
## How to Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build and run
```bash
cd composable_kernel/example/02_gemm_bilinear
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
make -j
```
### Run ```example_gemm_bilinear_xdl_fp16```
## Run ```example_gemm_bilinear_xdl_fp16```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
@@ -9,3 +81,35 @@
#arg11 to 12: alpha, beta
./bin/example_gemm_bilinear_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096 4096 0.5 0.5
```
---
## Source Code Structure
```
example/02_gemm_bilinear/
├── gemm_bilinear_xdl.cpp # Main example: sets up, runs, and verifies GEMM with bilinear fusion
├── gemm_bilinear_wmma_fp16.cpp # WMMA FP16 variant
├── gemm_bilinear_wmma_int8.cpp # WMMA int8 variant
include/ck/tensor_operation/gpu/device/
│ └── device_gemm_multiple_d.hpp # Device-level API for multi-tensor GEMM
include/ck/tensor_operation/gpu/device/impl/
│ └── device_gemm_bilinear_impl.hpp # Bilinear operation implementation
include/ck/tensor_operation/gpu/grid/
│ └── gridwise_gemm_multiple_d.hpp # Grid-level multi-tensor GEMM kernel
include/ck/tensor_operation/gpu/element/
└── element_wise_operation.hpp # Elementwise operation definitions
```
### Key Classes and Functions
- **DeviceGemmMultipleD** (in `device_gemm_multiple_d.hpp`):
Device API for GEMM with multiple auxiliary tensors and fused epilogues.
- **gridwise_gemm_multiple_d** (in `gridwise_gemm_multiple_d.hpp`):
Implements the tiled/blocking GEMM kernel with multi-tensor epilogue.
- **element_wise_operation** (in `element_wise_operation.hpp`):
Defines bilinear and other elementwise operations.
---
This example demonstrates how Composable Kernel supports complex multi-tensor fusion patterns for advanced neural network architectures.

View File

@@ -19,6 +19,10 @@
#include "ck/library/utility/check_err.hpp"
#include "ck/host_utility/device_prop.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
struct AlphaBetaAdd
{
AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
@@ -43,8 +47,9 @@ using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -190,11 +195,11 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};

View File

@@ -19,6 +19,10 @@
#include "ck/library/utility/check_err.hpp"
#include "ck/host_utility/device_prop.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
struct AlphaBetaAdd
{
AlphaBetaAdd(int alpha, int beta) : alpha_(alpha), beta_(beta){};
@@ -43,8 +47,9 @@ using S = ck::Sequence<Is...>;
using I8 = std::int8_t;
using I32 = std::int32_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -190,11 +195,11 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};

View File

@@ -18,6 +18,10 @@
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
struct AlphaBetaAdd
{
AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
@@ -42,8 +46,9 @@ using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -87,10 +92,10 @@ using DeviceOpInstance =
32,
8,
8,
32,
32,
16,
16,
8,
4,
2,
S<4, 64, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
@@ -108,7 +113,7 @@ using DeviceOpInstance =
1,
1,
S<1, 32, 1, 8>,
8>;
4>;
int main(int argc, char* argv[])
{
@@ -173,7 +178,7 @@ int main(int argc, char* argv[])
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
"beta\n");
exit(0);
exit(1);
}
auto f_host_tensor_descriptor =
@@ -182,11 +187,11 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};

View File

@@ -1,10 +1,63 @@
# Instructions for ```example_gemm_bias_relu_xdl_fp16```
# GEMM with Bias and ReLU Activation Fusion
## Run ```example_gemm_bias_relu_xdl_fp16```
## Theory
This example demonstrates **GEMM fused with bias addition and ReLU activation**. This is the core pattern for fully connected (dense) neural network layers and the feed-forward blocks in transformers.
**Mathematical Formulation:**
$$
E = \text{ReLU}(A \times B + \text{bias})
$$
- $A$: [M, K] input matrix
- $B$: [K, N] weight matrix
- $\text{bias}$: [N] bias vector (broadcast across the M rows)
- $E$: [M, N] output
**Algorithmic Background:**
- The GEMM result is kept in registers, bias is added, and ReLU is applied before writing to global memory.
- This fusion eliminates intermediate memory traffic and is a standard optimization in deep learning frameworks.
## How to Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build and run
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: time kernel (0=no, 1=yes)
#arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE
./bin/example_gemm_bias_relu_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096
cd composable_kernel/example/03_gemm_bias_relu
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
make -j
# Example run
./gemm_bias_relu_xdl -M 2048 -N 8192 -K 2048 --verify=1 --time=1
```
## Source Code Structure
### Directory Layout
```
example/03_gemm_bias_relu/
├── gemm_bias_relu_xdl.cpp # Main example: sets up, runs, and verifies GEMM+Bias+ReLU
include/ck/tensor_operation/gpu/device/
│ └── device_gemm_multiple_d.hpp # Device-level API for multi-tensor GEMM
include/ck/tensor_operation/gpu/device/impl/
│ ├── device_gemm_xdl_cshuffle_v3.hpp # XDL with C-Shuffle epilogue
│ └── device_gemm_bias_relu_impl.hpp # Specialized bias+ReLU implementation
include/ck/tensor_operation/gpu/grid/
│ └── gridwise_gemm_xdl_cshuffle.hpp # Grid-level GEMM with epilogue
include/ck/tensor_operation/gpu/element/
└── element_wise_operation.hpp # Elementwise operation definitions
```
### Key Classes and Functions
- **DeviceGemmMultipleD** (in `device_gemm_multiple_d.hpp`):
Device API for GEMM with auxiliary tensors and fused epilogues.
- **gridwise_gemm_xdl_cshuffle** (in `gridwise_gemm_xdl_cshuffle.hpp`):
Implements the tiled/blocking GEMM kernel with fused epilogue.
- **element_wise_operation** (in `element_wise_operation.hpp`):
Defines bias addition and ReLU activation.
This example demonstrates the standard epilogue fusion concept that enables efficient neural network layers in modern deep learning.

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -19,14 +19,19 @@
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -83,10 +88,10 @@ using DeviceOpInstance =
32,
8,
8,
32,
32,
16,
16,
8,
4,
2,
S<4, 64, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
@@ -104,7 +109,7 @@ using DeviceOpInstance =
1,
1,
S<1, 32, 1, 8>,
8>;
4>;
int main(int argc, char* argv[])
{
@@ -113,13 +118,13 @@ int main(int argc, char* argv[])
bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t M = 1920;
ck::index_t N = 2048;
ck::index_t K = 2048;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideE = 4096;
ck::index_t StrideA = 2048;
ck::index_t StrideB = 2048;
ck::index_t StrideE = 2048;
if(argc == 1)
{
@@ -160,17 +165,19 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};
ck::index_t StrideD = 0;
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, 0, ELayout{}));
Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, ELayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
@@ -221,7 +228,7 @@ int main(int argc, char* argv[])
K,
StrideA,
StrideB,
std::array<ck::index_t, 1>{0},
std::array<ck::index_t, 1>{static_cast<int>(StrideD)},
StrideE,
a_element_op,
b_element_op,

View File

@@ -1,10 +1,70 @@
# Instructions for ```example_gemm_add_add_fastgelu_xdl_fp16```
# GEMM with Add, Add, and FastGELU Activation
## Run ```example_gemm_add_add_fastgelu_xdl_fp16```
## Theory
This example demonstrates a **GEMM operation fused with two addition operations and FastGELU activation**. This pattern is used in transformer feed-forward networks and other neural architectures where a linear transformation is followed by bias addition, residual addition, and a non-linear activation.
**Mathematical Formulation:**
$$
E = \text{FastGELU}((A \times B) + D_0 + D_1)
$$
- $A$: [M, K] input matrix
- $B$: [K, N] weight matrix
- $D_0$: [N] bias vector (broadcast across the M rows)
- $D_1$: [M, N] residual tensor
- $E$: [M, N] output
FastGELU is an efficient approximation of GELU:
$$
\text{FastGELU}(x) = x \cdot \sigma(1.702 \cdot x)
$$
where $\sigma$ is the sigmoid function.
**Algorithmic Background:**
- The GEMM result is kept in registers, bias and residual are added, and FastGELU is applied before writing to global memory.
- No intermediate results are written to global memory.
## How to Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build and run
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: time kernel (0=no, 1=yes)
#arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE
./bin/example_gemm_add_add_fastgelu_xdl_fp16 1 1 1
cd composable_kernel/example/04_gemm_add_add_fastgelu
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
make -j
# Example run
./gemm_add_add_fastgelu_xdl -M 2048 -N 8192 -K 2048 --verify=1 --time=1
```
## Source Code Structure
### Directory Layout
```
example/04_gemm_add_add_fastgelu/
├── gemm_add_add_fastgelu_xdl.cpp # Main example: sets up, runs, and verifies GEMM+Add+Add+FastGELU
include/ck/tensor_operation/gpu/device/
│ └── device_gemm_multiple_d.hpp # Device-level API for multi-tensor GEMM
include/ck/tensor_operation/gpu/device/impl/
│ ├── device_gemm_xdl_cshuffle_v3.hpp # XDL with C-Shuffle epilogue
│ └── device_gemm_fastgelu_impl.hpp # FastGELU-specific implementation
include/ck/tensor_operation/gpu/grid/
│ └── gridwise_gemm_multiple_d_xdl.hpp # Grid-level multi-stage GEMM
include/ck/tensor_operation/gpu/element/
└── element_wise_operation.hpp # Elementwise operation definitions
```
### Key Classes and Functions
- **DeviceGemmMultipleD** (in `device_gemm_multiple_d.hpp`):
Device API for GEMM with multiple auxiliary tensors and fused epilogues.
- **gridwise_gemm_multiple_d_xdl** (in `gridwise_gemm_multiple_d_xdl.hpp`):
Implements the tiled/blocking GEMM kernel with multi-stage epilogue.
- **element_wise_operation** (in `element_wise_operation.hpp`):
Defines FastGELU and other elementwise operations.
This example demonstrates how Composable Kernel supports complex multi-stage epilogue fusion for advanced neural network architectures.

View File

@@ -23,6 +23,10 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -32,7 +32,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -32,7 +32,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

View File

@@ -1,4 +1,4 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -31,7 +31,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 2>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -32,7 +32,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 16>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

View File

@@ -6,18 +6,21 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
#endif
using namespace ck::literals;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
ProblemSize ps =
problem_size; // make mutable copy because default stride values of 0 need to be updated
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = ps;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};
@@ -41,6 +44,30 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
// If any user-provided leading stride < 0, replace it with the one determined by the
// created tensor descriptor. For RowMajor the leading stride is index 0, for ColMajor index 1.
// Returns the leading stride of `tensor` as an int. The entry is picked at compile time from
// the layout tag type: GetStrides()[0] when the tag is RowMajor, GetStrides()[1] for any
// other tag. Only the type of `layout_tag` is inspected; its value is never read.
auto fetch_leading_stride = [](const auto& tensor, auto layout_tag) -> int {
    using LayoutTag = decltype(layout_tag);
    constexpr bool is_row_major =
        std::is_same_v<LayoutTag, ck::tensor_layout::gemm::RowMajor>;
    constexpr std::size_t leading_idx = is_row_major ? 0 : 1;
    return static_cast<int>(tensor.GetStrides()[leading_idx]);
};
if(StrideA < 0)
StrideA = fetch_leading_stride(a_m_k, ALayout{});
if(StrideB < 0)
StrideB = fetch_leading_stride(b_k_n, BLayout{});
if(StrideD0 < 0)
StrideD0 = fetch_leading_stride(d0_m_n, D0Layout{});
if(StrideD1 < 0)
StrideD1 = fetch_leading_stride(d1_m_n, D1Layout{});
if(StrideE < 0)
StrideE = fetch_leading_stride(e_m_n_host_result, ELayout{});
switch(config.init_method)
{
case 0: break;

View File

@@ -19,4 +19,13 @@ foreach(gpu IN LISTS GPU_TARGETS)
add_example_executable(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
set(target 1)
endif()
endforeach()
endforeach()
# Register the TF32 convolution example exactly once, as soon as one of the requested
# GPU_TARGETS is found in gpu_list_tf32. `target` is left at 1 when the example was added
# and at 0 otherwise.
list(APPEND gpu_list_tf32 gfx942 gfx950)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list_tf32)
        add_example_executable(example_convnd_fwd_xdl_fp32_tf32 convnd_fwd_xdl_fp32_tf32.cpp)
        set(target 1)
        # A single registration is enough; skip the remaining targets.
        break()
    endif()
endforeach()

View File

@@ -1,6 +1,42 @@
# Instructions for ```example_convnd_fwd_xdl```
# N-Dimensional Convolution Forward
## Theory
This example demonstrates the **N-dimensional convolution forward pass** using Composable Kernel. Convolution is a fundamental operation in deep learning, especially in convolutional neural networks (CNNs) for images, audio, and volumetric data.
**Mathematical Formulation:**
Given:
- Input tensor: $X[N, C_{in}, D_1, D_2, ..., D_n]$
- Weight tensor: $W[C_{out}, C_{in}, K_1, K_2, ..., K_n]$
- Output tensor: $Y[N, C_{out}, O_1, O_2, ..., O_n]$
The convolution computes:
$$
Y[n, c_{out}, o_1, ..., o_n] = \sum_{c_{in}} \sum_{k_1} ... \sum_{k_n} X[n, c_{in}, o_1 + k_1, ..., o_n + k_n] \cdot W[c_{out}, c_{in}, k_1, ..., k_n]
$$
Stride, padding, and dilation parameters control the mapping between input and output indices.
**Algorithmic Background:**
- Composable Kernel implements convolution as an implicit GEMM (matrix multiplication) for efficiency.
- The input and weight tensors are transformed into matrices, and the convolution is performed as a GEMM.
## How to Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build and run
```bash
cd composable_kernel/example/09_convnd_fwd
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
make -j
```
### Run ```example_convnd_fwd_xdl```
## Run ```example_convnd_fwd_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
@@ -16,3 +52,29 @@
# <right padding>, (ie RightPy, RightPx for 2D)
./bin/example_convnd_fwd_xdl 0 1 100
```
## Source Code Structure
### Directory Layout
```
example/09_convnd_fwd/
├── convnd_fwd_xdl.cpp # Main example: sets up, runs, and verifies N-D convolution
include/ck/tensor_operation/gpu/device/
│ └── device_convnd_fwd.hpp # Device-level convolution API
include/ck/tensor_operation/gpu/device/impl/
│ └── device_convnd_fwd_xdl.hpp # XDL-based convolution implementation
include/ck/tensor_operation/gpu/grid/
│ └── gridwise_convnd_fwd_xdl.hpp # Grid-level convolution kernel
include/ck/tensor_operation/gpu/block/
└── blockwise_convnd_fwd_xdl.hpp # Block-level convolution
```
### Key Classes and Functions
- **DeviceConvNdFwd** (in `device_convnd_fwd.hpp`):
Device API for N-dimensional convolution.
- **gridwise_convnd_fwd_xdl** (in `gridwise_convnd_fwd_xdl.hpp`):
Implements the tiled/blocking convolution kernel.
- **blockwise_convnd_fwd_xdl** (in `blockwise_convnd_fwd_xdl.hpp`):
Handles block-level computation and shared memory tiling.
This example demonstrates how Composable Kernel implements efficient N-dimensional convolution using implicit GEMM, supporting a wide range of deep learning applications.

View File

@@ -19,6 +19,10 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
void print_helper_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
@@ -27,10 +31,14 @@ void print_helper_msg()
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
}
template <typename DataType>
template <typename DataType, typename GemmType = DataType>
inline __host__ __device__ constexpr double get_rtol()
{
if constexpr(std::is_same_v<DataType, float>)
if constexpr(std::is_same_v<DataType, float> && std::is_same_v<GemmType, ck::tf32_t>)
{
return 5e-3;
}
else if constexpr(std::is_same_v<DataType, float>)
{
return 1e-3;
}
@@ -68,10 +76,14 @@ inline __host__ __device__ constexpr double get_rtol()
}
}
template <typename DataType>
template <typename DataType, typename GemmType = DataType>
inline __host__ __device__ constexpr double get_atol()
{
if constexpr(std::is_same_v<DataType, float>)
if constexpr(std::is_same_v<DataType, float> && std::is_same_v<GemmType, ck::tf32_t>)
{
return 1e-3;
}
else if constexpr(std::is_same_v<DataType, float>)
{
return 1e-3;
}
@@ -116,7 +128,8 @@ template <ck::index_t NDimSpatial,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp,
typename DeviceConvNDFwdInstance>
typename DeviceConvNDFwdInstance,
typename ComputeDataType = OutDataType>
bool run_grouped_conv_fwd(bool do_verification,
int init_method,
bool time_kernel,
@@ -228,7 +241,11 @@ bool run_grouped_conv_fwd(bool do_verification,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>();
OutElementOp,
0,
0,
0,
ComputeDataType>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in,
@@ -249,8 +266,8 @@ bool run_grouped_conv_fwd(bool do_verification,
return ck::utils::check_err(out_device,
out_host,
"Error: incorrect results!",
get_rtol<OutDataType>(),
get_atol<OutDataType>());
get_rtol<OutDataType, ComputeDataType>(),
get_atol<OutDataType, ComputeDataType>());
}
return true;

View File

@@ -19,6 +19,10 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
void print_helper_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_common.hpp"
@@ -51,10 +51,10 @@ using DeviceGroupedConvNDFwdInstance =
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
4, // MXdlPerWave
8, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
@@ -72,7 +72,7 @@ using DeviceGroupedConvNDFwdInstance =
1,
1,
S<1, 32, 1, 8>,
8>;
4>;
#include "run_convnd_fwd_example.inc"

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_common.hpp"
@@ -52,10 +52,10 @@ using DeviceGroupedConvNDFwdInstance =
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
4, // MXdlPerWave
8, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
@@ -73,9 +73,17 @@ using DeviceGroupedConvNDFwdInstance =
1,
1,
S<1, 32, 1, 8>,
8,
4,
ComputeType>;
#include "run_convnd_fwd_example.inc"
int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
// Entry point. The example is temporarily disabled on gfx11: when ck::is_gfx11_supported()
// returns true the program exits with status 0 without running the example. Otherwise the
// exit status is 0 when run_convnd_fwd_example() returns true and 1 when it returns false.
int main(int argc, char* argv[])
{
    const bool skip_example = ck::is_gfx11_supported();
    if(skip_example)
    {
        return 0;
    }

    const bool example_passed = run_convnd_fwd_example(argc, argv);
    return example_passed ? 0 : 1;
}

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_common.hpp"
@@ -53,10 +53,10 @@ using DeviceGroupedConvNDFwdInstance =
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
4, // MXdlPerWave
8, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
@@ -74,10 +74,18 @@ using DeviceGroupedConvNDFwdInstance =
1,
1,
S<1, 32, 1, 8>,
8,
4,
AComputeType,
BComputeType>;
#include "run_convnd_fwd_example.inc"
int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
// Entry point. The example is temporarily disabled on gfx11: when ck::is_gfx11_supported()
// returns true the program exits with status 0 without running the example. Otherwise the
// exit status is 0 when run_convnd_fwd_example() returns true and 1 when it returns false.
int main(int argc, char* argv[])
{
    const bool skip_example = ck::is_gfx11_supported();
    if(skip_example)
    {
        return 0;
    }

    const bool example_passed = run_convnd_fwd_example(argc, argv);
    return example_passed ? 0 : 1;
}

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_common.hpp"
@@ -51,10 +51,10 @@ using DeviceGroupedConvNDFwdInstance =
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
4, // MXdlPerWave
8, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
@@ -72,7 +72,7 @@ using DeviceGroupedConvNDFwdInstance =
1,
1,
S<1, 32, 1, 8>,
8>;
4>;
#include "run_convnd_fwd_example.inc"

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_common.hpp"
@@ -52,10 +52,10 @@ using DeviceGroupedConvNDFwdInstance =
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
4, // MXdlPerWave
8, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
@@ -73,9 +73,17 @@ using DeviceGroupedConvNDFwdInstance =
1,
1,
S<1, 32, 1, 8>,
8,
4,
ComputeType>;
#include "run_convnd_fwd_example.inc"
int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
// Entry point. fp8 is not supported on gfx11, so when ck::is_gfx11_supported() returns true
// the program exits with status 0 without running the example. Otherwise the exit status is
// 0 when run_convnd_fwd_example() returns true and 1 when it returns false.
int main(int argc, char* argv[])
{
    const bool skip_example = ck::is_gfx11_supported();
    if(skip_example)
    {
        return 0;
    }

    const bool example_passed = run_convnd_fwd_example(argc, argv);
    return example_passed ? 0 : 1;
}

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_common.hpp"
@@ -76,4 +76,11 @@ using DeviceGroupedConvNDFwdInstance =
#include "run_convnd_fwd_example.inc"
int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
// Entry point. The example is skipped (exit status 0, nothing is run) when either
// ck::is_gfx11_supported() or ck::is_gfx12_supported() returns true; the gfx12 query is only
// made when the gfx11 query returned false. Otherwise the exit status is 0 when
// run_convnd_fwd_example() returns true and 1 when it returns false.
int main(int argc, char* argv[])
{
    if(ck::is_gfx11_supported())
    {
        return 0;
    }
    if(ck::is_gfx12_supported())
    {
        return 0;
    }

    const bool example_passed = run_convnd_fwd_example(argc, argv);
    return example_passed ? 0 : 1;
}

View File

@@ -0,0 +1,89 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
// Defined before the shared example runner is included at the bottom of this file, so that the
// ComputeDataType alias declared here is used instead of the runner's default.
#define EXAMPLE_WITH_COMPUTE_DATATYPE
// Input, weight, accumulator, CShuffle and output element types are all float.
using InDataType = float;
using WeiDataType = float;
using AccDataType = float;
using CShuffleDataType = float;
using OutDataType = float;
// tf32 compute type; passed to the device instance below as both the A and B compute type.
using ComputeDataType = ck::tf32_t;
// Shorthand for compile-time integer sequences used in the instance parameter list.
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
// No element-wise transformation is applied to the input, weight or output.
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
// Convolution and GEMM specializations handed to the device instance below.
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
// Device convolution instance exercised by this example, parameterized on the number of
// spatial dimensions and on the input / weight / output layouts.
// All tensor data types are float, while ComputeDataType (ck::tf32_t) is supplied as both the
// A and the B compute type.
// Tile configuration, as labelled on the arguments below: block size 256, MPerBlock 128,
// NPerBlock 192, KPerBlock 16, 32 x 32 XDL tiles with 2 (M) x 3 (N) XDL tiles per wave.
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
NDimSpatial,
InLayout, // ALayout
WeiLayout, // BLayout
ck::Tuple<>, // DsLayout
OutLayout, // ELayout
InDataType, // ADataType
WeiDataType, // BDataType
AccDataType, // AccDataType
CShuffleDataType, // CShuffleDataType
ck::Tuple<>, // DsDataType
OutDataType, // EDataType
InElementOp, // AElementwiseOperation
WeiElementOp, // BElementwiseOperation
OutElementOp, // CDEElementwiseOperation
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // NumGemmKPrefetchStage
256, // BlockSize
128, // MPerBlock
192, // NPerBlock
16, // KPerBlock
4, // AK1
4, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
3, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
4, // ABlockTransferSrcScalarPerVector
4, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
4, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 16, 1, 16>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
4, // CDEBlockTransferScalarPerVector_NPerBlock
ComputeDataType, // AComputeDataType
ComputeDataType, // BComputeDataType
ck::LoopScheduler::Default, // LoopScheduler
1 // NumGroupsToMerge
>;
#include "run_convnd_fwd_example.inc"
// Entry point: exit status is 0 when run_convnd_fwd_example() returns true, 1 otherwise.
int main(int argc, char* argv[])
{
    const bool example_passed = run_convnd_fwd_example(argc, argv);
    if(example_passed)
    {
        return 0;
    }
    return 1;
}
#undef EXAMPLE_WITH_COMPUTE_DATATYPE

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_common.hpp"
@@ -7,6 +7,8 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#define EXAMPLE_WITH_COMPUTE_DATATYPE
using InDataType = ck::f8_t;
using WeiDataType = ck::f8_t;
using AccDataType = float;
@@ -52,10 +54,10 @@ using DeviceGroupedConvNDFwdInstance =
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
4, // MXdlPerWave
8, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
@@ -73,9 +75,19 @@ using DeviceGroupedConvNDFwdInstance =
1,
1,
S<1, 32, 1, 8>,
8,
4,
ComputeDataType>;
#include "run_convnd_fwd_example.inc"
int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
// Entry point. The example is temporarily disabled on gfx11: when ck::is_gfx11_supported()
// returns true the program exits with status 0 without running the example. Otherwise the
// exit status is 0 when run_convnd_fwd_example() returns true and 1 when it returns false.
int main(int argc, char* argv[])
{
    const bool skip_example = ck::is_gfx11_supported();
    if(skip_example)
    {
        return 0;
    }

    const bool example_passed = run_convnd_fwd_example(argc, argv);
    return example_passed ? 0 : 1;
}
#undef EXAMPLE_WITH_COMPUTE_DATATYPE

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_common.hpp"
@@ -53,10 +53,10 @@ using DeviceGroupedConvNDFwdInstance =
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
4, // MXdlPerWave
8, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
@@ -74,10 +74,18 @@ using DeviceGroupedConvNDFwdInstance =
1,
1,
S<1, 32, 1, 8>,
8,
4,
AComputeType,
BComputeType>;
#include "run_convnd_fwd_example.inc"
int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
// Entry point. The example is temporarily disabled on gfx11: when ck::is_gfx11_supported()
// returns true the program exits with status 0 without running the example. Otherwise the
// exit status is 0 when run_convnd_fwd_example() returns true and 1 when it returns false.
int main(int argc, char* argv[])
{
    const bool skip_example = ck::is_gfx11_supported();
    if(skip_example)
    {
        return 0;
    }

    const bool example_passed = run_convnd_fwd_example(argc, argv);
    return example_passed ? 0 : 1;
}

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_common.hpp"
@@ -51,10 +51,10 @@ using DeviceGroupedConvNDFwdInstance =
64, // KPerBlock
16, // AK1
16, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
4, // MXdlPerWave
8, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
@@ -71,8 +71,8 @@ using DeviceGroupedConvNDFwdInstance =
1, // BBlockLdsExtraN
1,
1,
S<1, 64, 1, 4>,
16>;
S<1, 32, 1, 8>,
4>;
#include "run_convnd_fwd_example.inc"

View File

@@ -3,6 +3,11 @@
#pragma once
// use macro to minimize code change
// An example that defines EXAMPLE_WITH_COMPUTE_DATATYPE before this point is expected to
// provide its own ComputeDataType alias; when the macro is not defined, ComputeDataType
// falls back to AccDataType here.
#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
using ComputeDataType = AccDataType;
#endif
bool run_convnd_fwd_example(int argc, char* argv[])
{
print_helper_msg();
@@ -65,17 +70,17 @@ bool run_convnd_fwd_example(int argc, char* argv[])
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>,
ComputeDataType>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
};
namespace ctc = ck::tensor_layout::convolution;

View File

@@ -0,0 +1,57 @@
# N-Dimensional Convolution with Multiple D and Multiple Reduce
## Theory
This example demonstrates **N-dimensional convolution forward** with support for multiple auxiliary tensors (D) and multiple reduction operations. This is useful for advanced neural network layers that require additional outputs or statistics alongside the main convolution result.
**Mathematical Formulation:**
- Input tensor: $X[N, C_{in}, D_1, D_2, ..., D_n]$
- Weight tensor: $W[C_{out}, C_{in}, K_1, K_2, ..., K_n]$
- Auxiliary tensors: $D_0, D_1, ...$ (various shapes)
- Output tensor: $Y[N, C_{out}, O_1, O_2, ..., O_n]$
- Reduction operations: e.g., sum, mean, max over specified axes
The convolution computes the standard output as well as additional outputs or statistics by applying reduction operations to the convolution result or auxiliary tensors.
**Algorithmic Background:**
- Composable Kernel implements this as an implicit GEMM with support for multiple auxiliary tensors and reductions in the epilogue.
## How to Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build and run
```bash
cd composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
make -j
# Example run
./convnd_fwd_multiple_d_multiple_reduce_xdl --verify=1 --time=1
```
## Source Code Structure
### Directory Layout
```
example/10_convnd_fwd_multiple_d_multiple_reduce/
├── convnd_fwd_multiple_d_multiple_reduce_xdl.cpp # Main example: sets up, runs, and verifies N-D convolution with multiple D/reduce
include/ck/tensor_operation/gpu/device/
│ └── device_convnd_fwd_multiple_d_multiple_reduce.hpp # Device-level API for multi-D/multi-reduce convolution
include/ck/tensor_operation/gpu/device/impl/
│ └── device_convnd_fwd_multiple_d_multiple_reduce_impl.hpp # Implementation
include/ck/tensor_operation/gpu/grid/
└── gridwise_convnd_fwd_multiple_d_multiple_reduce.hpp # Grid-level kernel
```
### Key Classes and Functions
- **DeviceConvNdFwdMultipleDMultipleReduce** (in `device_convnd_fwd_multiple_d_multiple_reduce.hpp`):
Device API for N-dimensional convolution with multiple outputs and reductions.
- **gridwise_convnd_fwd_multiple_d_multiple_reduce** (in `gridwise_convnd_fwd_multiple_d_multiple_reduce.hpp`):
Implements the tiled/blocking convolution kernel with multi-output/reduce epilogue.
This example demonstrates how Composable Kernel supports advanced convolution patterns with multiple outputs and reductions in a single efficient kernel.

View File

@@ -26,6 +26,10 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
using BF16 = ck::bhalf_t;
using FP16 = ck::half_t;
using FP32 = float;
@@ -125,7 +129,7 @@ inline bool parse_cmd_args(int argc,
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
problem_size = ck::utils::conv::parse_conv_param(
num_dim_spatial, threshold_to_catch_partial_args, argv);
num_dim_spatial, threshold_to_catch_partial_args + 1, argv);
}
else
{

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -15,4 +15,11 @@ using RsDataType = ck::Tuple<R0DataType>;
#include "run_convnd_fwd_max_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
// Entry point. The example is skipped (exit status 0, nothing is run) when either
// ck::is_gfx11_supported() or ck::is_gfx12_supported() returns true; the gfx12 query is only
// made when the gfx11 query returned false. Otherwise the exit status is 0 when
// run_convnd_fwd_max_example() yields a true value and 1 when it yields a false one.
int main(int argc, char* argv[])
{
    if(ck::is_gfx11_supported())
    {
        return 0;
    }
    if(ck::is_gfx12_supported())
    {
        return 0;
    }

    if(run_convnd_fwd_max_example(argc, argv))
    {
        return 0;
    }
    return 1;
}

View File

@@ -23,7 +23,7 @@ using RsGlobalReduceOp =
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
// clang-format off
template <ck::index_t NDimSpatial>
@@ -36,7 +36,7 @@ using DeviceInstance =
#ifdef BUILD_INT4_EXAMPLE
< NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>, KernelADataType, KernelBDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, ConvSpec, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>;
#else
< NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, ConvSpec, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>;
< NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, ConvSpec, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<32, 8>, 4, 1>;
#endif
template <ck::index_t NDimSpatial>

View File

@@ -0,0 +1,57 @@
# N-Dimensional Convolution Forward with Bias
## Theory
This example demonstrates **N-dimensional convolution forward** with bias addition. This is a common pattern in convolutional neural networks (CNNs), where a bias term is added to each output channel after the convolution operation.
**Mathematical Formulation:**
$$
Y[n, c_{out}, o_1, ..., o_n] = \sum_{c_{in}} \sum_{k_1} ... \sum_{k_n} X[n, c_{in}, o_1 + k_1, ..., o_n + k_n] \cdot W[c_{out}, c_{in}, k_1, ..., k_n] + B[c_{out}]
$$
- $X$: [N, C_in, D1, D2, ..., Dn] input tensor
- $W$: [C_out, C_in, K1, K2, ..., Kn] weight tensor
- $B$: [C_out] bias tensor
- $Y$: [N, C_out, O1, O2, ..., On] output tensor
**Algorithmic Background:**
- Composable Kernel implements convolution as an implicit GEMM, with bias addition fused in the epilogue for efficiency.
## How to Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build and run
```bash
cd composable_kernel/example/11_convnd_fwd_bias
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
make -j
# Example run
./convnd_fwd_bias_xdl --verify=1 --time=1
```
## Source Code Structure
### Directory Layout
```
example/11_convnd_fwd_bias/
├── convnd_fwd_bias_xdl.cpp # Main example: sets up, runs, and verifies N-D convolution with bias
include/ck/tensor_operation/gpu/device/
│ └── device_convnd_fwd_bias.hpp # Device-level convolution API with bias
include/ck/tensor_operation/gpu/device/impl/
│ └── device_convnd_fwd_bias_impl.hpp # Implementation
include/ck/tensor_operation/gpu/grid/
└── gridwise_convnd_fwd_bias.hpp # Grid-level kernel
```
### Key Classes and Functions
- **DeviceConvNdFwdBias** (in `device_convnd_fwd_bias.hpp`):
Device API for N-dimensional convolution with bias.
- **gridwise_convnd_fwd_bias** (in `gridwise_convnd_fwd_bias.hpp`):
Implements the tiled/blocking convolution kernel with bias epilogue.
This example demonstrates how Composable Kernel fuses bias addition into the convolution forward pass for efficient CNN layer implementation.

View File

@@ -1,6 +1,38 @@
# Instructions for ```example_reduce_blockwise```
# Parallel Reduction Operations
## Theory
This example demonstrates **parallel reduction operations** (e.g., sum, max, min, mean) over tensors. Reduction is a fundamental operation in deep learning for computing statistics (such as batch mean/variance), loss aggregation, and normalization.
**Mathematical Formulation:**
Given a tensor $X$ and a reduction axis $a$:
$$
Y = \text{reduce}_{a}(X)
$$
- For sum: $Y = \sum_{i \in a} X_i$
- For max: $Y = \max_{i \in a} X_i$
- For mean: $Y = \frac{1}{|a|} \sum_{i \in a} X_i$
**Algorithmic Background:**
- Reductions are implemented using parallel tree reduction or segmented reduction algorithms.
- Efficient reductions require careful memory access, synchronization, and sometimes numerically stable algorithms (e.g., Welford's for variance).
## How to Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build and run
```bash
cd composable_kernel/example/12_reduce
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
make -j
```
## Run ```example_reduce_blockwise```
```bash
# -D <xxx> : input 3D/4D/5D tensor lengths
# -R <xxx> : reduce dimension ids
@@ -11,7 +43,8 @@
./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 0 2 1
```
Result
Expected Result:
```
./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 0 2 1
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
@@ -21,6 +54,7 @@ Perf: 0.238063 ms, 264.285 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSr
```
## Run ```example_reduce_multiblock_atomic_add```
```bash
# -D <xxx> : input 3D/4D/5D tensor lengths
# -R <xxx> : reduce dimension ids
@@ -31,7 +65,7 @@ Perf: 0.238063 ms, 264.285 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSr
./bin/example_reduce_multiblock_atomic_add -D 16,64,32,960 -v 1 0 2 0
```
Result
Expected Result:
```
./bin/example_reduce_multiblock_atomic_add -D 16,64,32,960 -v 1 0 2 0
Perf: 0 ms, inf GB/s, DeviceReduceMultiBlock<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
@@ -42,6 +76,7 @@ echo $?
# Instructions for ```example_reduce_blockwise_two_call```
## Run ```example_reduce_blockwise_two_call```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
@@ -49,7 +84,8 @@ echo $?
./bin/example_reduce_blockwise_two_call 1 2 1
```
Result
Expected Result:
```
./bin/example_reduce_blockwise_two_call 1 2 1
launch_and_time_kernel: grid_dim {204800, 1, 1}, block_dim {256, 1, 1}
@@ -60,3 +96,30 @@ Warm up 1 time
Start running 10 times...
Perf: 2.1791 ms, 771.42 GB/s, DeviceReduceBlockWise<256,M_C32_S1,K_C8_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> => DeviceReduceBlockWise<256,M_C256_S1,K_C1_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1>
```
## Source Code Structure
### Directory Layout
```
example/12_reduce/
├── reduce_xdl.cpp # Main example: sets up, runs, and verifies reduction
include/ck/tensor_operation/gpu/device/
│ └── device_reduce.hpp # Device-level reduction API
include/ck/tensor_operation/gpu/device/impl/
│ └── device_reduce_impl.hpp # Implementation
include/ck/tensor_operation/gpu/grid/
│ └── gridwise_reduce.hpp # Grid-level reduction kernel
include/ck/tensor_operation/gpu/block/
└── blockwise_reduce.hpp # Block-level reduction
```
### Key Classes and Functions
- **DeviceReduce** (in `device_reduce.hpp`):
Device API for reductions.
- **gridwise_reduce** (in `gridwise_reduce.hpp`):
Implements the tiled/blocking reduction kernel.
- **blockwise_reduce** (in `blockwise_reduce.hpp`):
Handles block-level reduction and shared memory.
This example demonstrates how Composable Kernel implements efficient parallel reductions for deep learning and scientific computing.

View File

@@ -20,6 +20,10 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
using namespace ck;
using namespace ck::tensor_operation::device;
@@ -100,13 +104,13 @@ int main(int argc, char* argv[])
const std::array<int, 2> reduceDims = {3, 4};
// const std::array<int, 3> invariantDims = {0, 1, 2};
const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
// input lengths of the second reduction, which is also the output lengths of the first
// reduction
const std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
const std::vector<size_t> outLengths = {64, 320, 80};
std::vector<size_t> outLengths = {64, 320, 80};
if(argc == 1)
{
@@ -114,11 +118,26 @@ int main(int argc, char* argv[])
init_method = 2;
time_kernel = true;
}
else if(argc == 4)
else if((argc == 4) || (argc == 9))
{
do_verify = static_cast<bool>(argv[1]);
init_method = atoi(argv[2]);
time_kernel = static_cast<bool>(atoi(argv[3]));
if(argc == 9)
{
inLengths_1[0] = atoi(argv[4]);
inLengths_1[1] = atoi(argv[5]);
inLengths_1[2] = atoi(argv[6]);
inLengths_1[3] = atoi(argv[7]);
inLengths_1[4] = atoi(argv[8]);
inLengths_2[0] = inLengths_1[0];
inLengths_2[1] = inLengths_1[1];
inLengths_2[2] = inLengths_1[2];
inLengths_2[3] = inLengths_1[3];
outLengths[0] = inLengths_1[0];
outLengths[1] = inLengths_1[1];
outLengths[2] = inLengths_1[2];
}
}
else
{

View File

@@ -1,6 +1,41 @@
# Instructions for ```example_pool2d_fwd``` Examples
# 2D Pooling Forward
## Theory
This example demonstrates the **2D pooling forward pass**, a key operation in convolutional neural networks (CNNs) for spatial downsampling. Pooling reduces the spatial dimensions of feature maps, providing translation invariance and reducing computation.
**Mathematical Formulation:**
Given input $X[N, C, H_{in}, W_{in}]$, pooling window $(k_H, k_W)$, stride $(s_H, s_W)$, and padding $(p_H, p_W)$:
- Output $Y[N, C, H_{out}, W_{out}]$
- $H_{out} = \left\lfloor \frac{H_{in} + 2p_H - k_H}{s_H} \right\rfloor + 1$
- $W_{out} = \left\lfloor \frac{W_{in} + 2p_W - k_W}{s_W} \right\rfloor + 1$
For each output position:
- **Max Pooling:** $Y_{n,c,h,w} = \max_{i,j} X_{n,c,h \cdot s_H + i, w \cdot s_W + j}$
- **Average Pooling:** $Y_{n,c,h,w} = \frac{1}{k_H k_W} \sum_{i,j} X_{n,c,h \cdot s_H + i, w \cdot s_W + j}$
**Algorithmic Background:**
- Each thread computes one or more output elements.
- Handles padding and boundary conditions.
- Optimizes memory access for bandwidth.
## How to Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build and run
```bash
cd composable_kernel/example/13_pool2d_fwd
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
make -j
```
### Run ```example_pool2d_fwd_fp16```
## Run ```example_pool2d_fwd_fp16```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
@@ -9,7 +44,7 @@
./bin/example_pool2d_fwd_fp16 1 1 1
```
Result
Expected Result:
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
@@ -19,7 +54,8 @@ Start running 10 times...
Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s
```
## Run ```example_pool2d_fwd_fp32```
### Run ```example_pool2d_fwd_fp32```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
@@ -29,8 +65,9 @@ Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s
```
Result
```
Expected Result:
```bash
./bin/example_pool2d_fwd_fp32 1 1 1
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
@@ -39,3 +76,31 @@ Warm up 1 time
Start running 10 times...
Perf: 1.01823 ms, 0.563045 TFlops, 611.8 GB/s
```
## Source Code Structure
### Directory Layout
```
example/13_pool2d_fwd/
├── pool2d_fwd_xdl.cpp # Main example: sets up, runs, and verifies 2D pooling
include/ck/tensor_operation/gpu/device/
│ └── device_pool_fwd.hpp # Device-level pooling API
include/ck/tensor_operation/gpu/device/impl/
│ └── device_pool2d_fwd_nhwc.hpp # NHWC layout optimization
│ └── device_pool2d_fwd_nchw.hpp # NCHW layout optimization
include/ck/tensor_operation/gpu/grid/
│ └── gridwise_pool_fwd.hpp # Grid-level pooling kernel
include/ck/tensor_operation/gpu/block/
└── blockwise_pool.hpp # Block-level pooling
```
### Key Classes and Functions
- **DevicePoolFwd** (in `device_pool_fwd.hpp`):
Device API for pooling.
- **gridwise_pool_fwd** (in `gridwise_pool_fwd.hpp`):
Implements the tiled/blocking pooling kernel.
- **blockwise_pool** (in `blockwise_pool.hpp`):
Handles block-level pooling and shared memory.
This example demonstrates how Composable Kernel implements efficient 2D pooling for CNNs and vision models.

View File

@@ -19,6 +19,10 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <typename InDataType,
typename OutDataType,
typename ComputeDataType,
@@ -78,12 +82,12 @@ bool pool_test(bool do_verification,
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value)
{
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz}, layout);
}
else if constexpr(ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWC>::value)
{
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_}, layout);
}
};

View File

@@ -1,3 +1,4 @@
# int8 quantized GEMM examples: each call pairs an example target name with its single source file.
# NOTE(review): add_example_executable is a project helper defined outside this file; any
# per-architecture filtering it applies (DL / WMMA / XDL) is not visible here -- confirm there.
add_example_executable(example_gemm_dl_quantization_int8 gemm_dl_quantization_int8.cpp)
add_example_executable(example_gemm_wmma_quantization_int8 gemm_wmma_quantization_int8.cpp)
add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp)
add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp)

View File

@@ -0,0 +1,60 @@
# GEMM with Quantization
## Theory
This example demonstrates **GEMM (General Matrix Multiplication) with quantized inputs or weights**. Quantization is a technique to reduce memory and computation by representing values with lower-precision integer types (e.g., int8), commonly used for efficient inference in deep learning.
**Mathematical Formulation:**
- Quantized GEMM: $C = \text{dequant}(A_q) \times \text{dequant}(B_q)$
- $A_q$, $B_q$: quantized matrices (e.g., int8)
- $\text{dequant}(x_q) = (x_q - z) \cdot s$ (scale $s$, zero-point $z$)
- $C$: output matrix (often in higher precision, e.g., float32 or float16)
**Algorithmic Background:**
- Quantized values are dequantized on-the-fly during GEMM computation.
- Accumulation is performed in higher precision for accuracy.
- Supports symmetric and asymmetric quantization.
## How to Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build and run
```bash
cd composable_kernel/example/14_gemm_quantization
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
make -j
# Example run
./bin/example_gemm_xdl_quantization_int8
```
## Source Code Structure
### Directory Layout
```
example/14_gemm_quantization/
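├── gemm_dl_quantization_int8.cpp            # int8 quantized GEMM example (DL)
├── gemm_wmma_quantization_int8.cpp          # int8 quantized GEMM example (WMMA)
├── gemm_xdl_bias_relu_quantization_int8.cpp # int8 quantized GEMM example with bias + ReLU (XDL)
├── gemm_xdl_quantization_int8.cpp           # int8 quantized GEMM example (XDL)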
include/ck/tensor_operation/gpu/device/
│ └── device_gemm_quantized.hpp # Device-level quantized GEMM API
include/ck/tensor_operation/gpu/device/impl/
│ └── device_gemm_quantized_impl.hpp # Implementation
include/ck/tensor_operation/gpu/grid/
│ └── gridwise_gemm_quantized.hpp # Grid-level quantized GEMM kernel
include/ck/tensor_operation/gpu/element/
└── quantization_operations.hpp # Quantization/dequantization utilities
```
### Key Classes and Functions
- **DeviceGemmQuantized** (in `device_gemm_quantized.hpp`):
Device API for quantized GEMM.
- **gridwise_gemm_quantized** (in `gridwise_gemm_quantized.hpp`):
Implements the tiled/blocking quantized GEMM kernel.
- **quantization_operations** (in `quantization_operations.hpp`):
Defines quantization and dequantization functions.
This example demonstrates how Composable Kernel supports efficient quantized matrix multiplication for deep learning inference.

View File

@@ -19,6 +19,10 @@
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -115,12 +119,14 @@ int main()
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1_uz}));
std::vector<std::size_t>({stride, 1_uz}),
layout);
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1_uz, stride}));
std::vector<std::size_t>({1_uz, stride}),
layout);
}
};

View File

@@ -0,0 +1,215 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using I8 = int8_t;
using I32 = int32_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = PassThrough;
using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
using ADataType = I8;
using BDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using DsDataType = ck::Tuple<>;
using EDataType = I8;
using ALayout = Col;
using BLayout = Row;
using DsLayout = ck::Tuple<>;
using ELayout = Row;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
// Device GEMM instance used by this example (DeviceGemmMultipleD_Wmma_CShuffleV3).
// Per the aliases above: A is column-major, B and E are row-major; A, B and E are int8;
// the accumulator and C-shuffle types are int32; there are no extra D tensors
// (DsDataType / DsLayout are empty tuples). The A and B element-wise ops are PassThrough
// (ActivationOp aliases PassThrough) and the output op is Activation_Mul_Clamp<PassThrough>.
// NOTE(review): the integer / Sequence arguments after GemmDefault are presumably the
// block-size, tile and block-transfer tuning parameters, and the trailing I8, I8 presumably
// the A/B compute types. The template declaration is not visible in this file, so they are
// left unannotated -- confirm against device_gemm_multiple_d_wmma_cshuffle_v3.hpp before
// changing any of them.
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Wmma_CShuffleV3<
ALayout,
BLayout,
DsLayout,
ELayout,
ADataType,
BDataType,
DsDataType,
EDataType,
AccDataType,
CShuffleDataType,
ActivationOp,
ActivationOp,
CDEElementOp,
GemmDefault,
256,
128,
128,
64,
8,
8,
16,
16,
4,
2,
S<4, 64, 1>,
S<0, 2, 1>,
S<0, 2, 1>,
1,
1,
8,
true,
S<4, 64, 1>,
S<0, 2, 1>,
S<0, 2, 1>,
1,
1,
8,
true,
1,
1,
S<1, 32, 1, 8>,
S<1>,
ck::BlockGemmPipelineScheduler::Intrawave,
ck::BlockGemmPipelineVersion::v1,
I8,
I8>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, EDataType, float, PassThrough, PassThrough, CDEElementOp>;
int main(int /* argc */, char* /* argv */[])
{
bool do_verification = true;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1024;
ck::index_t K = 1024;
ck::index_t StrideA = K;
ck::index_t StrideB = N;
ck::index_t StrideE = N;
float requant_scale = 0.03;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = PassThrough{};
auto b_element_op = PassThrough{};
auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
// device GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
std::array<const void*, 0>{},
static_cast<EDataType*>(e_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
std::array<ck::index_t, 0>{},
StrideE,
1,
a_element_op,
b_element_op,
cde_element_op);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
if(do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, e_m_n_host_result, a_element_op, b_element_op, cde_element_op);
ref_invoker.Run(ref_argument);
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
}
return 0;
}

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -19,6 +19,10 @@
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -70,10 +74,10 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
64, // KPerBlock,
16, // AK1,
16, // BK1,
32, // MPerXDL,
32, // NPerXDL,
4, // MXdlPerWave,
2, // NXdlPerWave,
16, // MPerXDL,
16, // NPerXDL,
8, // MXdlPerWave,
4, // NXdlPerWave,
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1,
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder,
S<1, 0, 2>, // ABlockTransferSrcAccessOrder,
@@ -90,8 +94,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
1, // bool BBlockLdsExtraN,
1, // index_t CShuffleMXdlPerWavePerShuffle,
1, // index_t CShuffleNXdlPerWavePerShuffle,
S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
8>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
S<1, 32, 1, 8>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
4>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -19,6 +19,10 @@
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -68,10 +72,10 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
64, // KPerBlock,
16, // AK1,
16, // BK1,
32, // MPerXDL,
32, // NPerXDL,
4, // MXdlPerWave,
2, // NXdlPerWave,
16, // MPerXDL,
16, // NPerXDL,
8, // MXdlPerWave,
4, // NXdlPerWave,
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1,
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder,
S<1, 0, 2>, // ABlockTransferSrcAccessOrder,
@@ -88,8 +92,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
1, // bool BBlockLdsExtraN,
1, // index_t CShuffleMXdlPerWavePerShuffle,
1, // index_t CShuffleNXdlPerWavePerShuffle,
S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
S<1, 32, 1, 8>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
4>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::

View File

@@ -33,3 +33,13 @@ if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp)
add_example_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int4)
endif()
# Register the FP32/TF32 grouped GEMM example at most once, and only when at least one
# entry of GPU_TARGETS is in the TF32 target list below.
list(APPEND gpu_list_tf32 gfx942 gfx950)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
    # Already registered for an earlier target: nothing left to do.
    if(NOT target EQUAL 0)
        break()
    endif()
    if(gpu IN_LIST gpu_list_tf32)
        add_example_executable(example_grouped_gemm_xdl_fp32_tf32 grouped_gemm_xdl_fp32_tf32.cpp)
        add_example_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fp32_tf32)
        set(target 1)
    endif()
endforeach()

View File

@@ -1,9 +1,64 @@
# Instructions for ```example_grouped_gemm_xdl```
# Grouped GEMM
## Theory
This example demonstrates **grouped GEMM**: performing multiple independent GEMM operations (with potentially different shapes) in a single kernel launch. Grouped GEMM is used in transformer models (e.g., multi-head attention), mixture-of-experts, and other architectures requiring heterogeneous batched matrix multiplications.
**Mathematical Formulation:**
For $G$ groups, each with its own $A_g$, $B_g$, $C_g$:
$$
C_g = A_g \times B_g \quad \text{for} \quad g = 1, 2, \ldots, G
$$
- $A_g$: $[M_g, K_g]$ input matrix for group $g$
- $B_g$: $[K_g, N_g]$ weight matrix for group $g$
- $C_g$: $[M_g, N_g]$ output matrix for group $g$
**Algorithmic Background:**
- Each group can have different matrix sizes and strides.
- The kernel launches a grid covering all groups, with each block assigned to a group.
- Useful for variable-length sequences, multi-head attention, and expert routing.
## How to Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build and run
```bash
cd composable_kernel/example/15_grouped_gemm
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
make -j
```
### Run ```example_grouped_gemm_xdl```
## Run ```example_grouped_gemm_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
./bin/example_grouped_gemm_xdl_fp16 0 1 5
```
## Source Code Structure
### Directory Layout
```
example/15_grouped_gemm/
├── grouped_gemm_xdl.cpp # Main example: sets up, runs, and verifies grouped GEMM
include/ck/tensor_operation/gpu/device/
│ └── device_grouped_gemm_xdl.hpp # Device-level grouped GEMM API
include/ck/tensor_operation/gpu/grid/
│ └── gridwise_grouped_gemm_xdl.hpp # Grid-level grouped GEMM kernel
```
### Key Classes and Functions
- **DeviceGroupedGemmXdl** (in `device_grouped_gemm_xdl.hpp`):
Device API for grouped GEMM.
- **gridwise_grouped_gemm_xdl** (in `gridwise_grouped_gemm_xdl.hpp`):
Implements the tiled/blocking grouped GEMM kernel.
This example demonstrates how Composable Kernel supports efficient heterogeneous batched matrix multiplication for advanced AI/ML workloads.

View File

@@ -25,6 +25,11 @@
#include "ck/utility/tuple.hpp"
#include "ck/utility/sequence.hpp"
using ::ck::DeviceMem;
using ::ck::hip_check_error;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

View File

@@ -23,6 +23,11 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp"
using ::ck::DeviceMem;
using ::ck::hip_check_error;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -23,6 +23,11 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp"
using ::ck::DeviceMem;
using ::ck::hip_check_error;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -63,7 +68,7 @@ using DeviceGemmInstance =
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, S<4,4,4>>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, S<4,4,4>>;
// clang-format on
struct ProblemSize final

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -19,6 +19,11 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
using ::ck::DeviceMem;
using ::ck::hip_check_error;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -54,7 +59,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
#include "run_grouped_gemm_example.inc"

View File

@@ -20,6 +20,11 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
using ::ck::DeviceMem;
using ::ck::hip_check_error;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -323,6 +328,31 @@ int main(int argc, char* argv[])
problem_size.Ms = {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};
if(argc == 5)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.k_batch = std::stoi(argv[4]);
}
else if(argc == 6)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.k_batch = std::stoi(argv[4]);
problem_size.group_count = std::stoi(argv[5]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: k_batch (>0)\n");
printf("arg5: group count (default=16)");
exit(0);
}
for(int i = 0; i < problem_size.group_count; i++)
{
problem_size.Ns.push_back(768);
@@ -333,21 +363,5 @@ int main(int argc, char* argv[])
problem_size.stride_Cs.push_back(problem_size.Ns[i]);
}
if(argc == 5)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.k_batch = std::stoi(argv[4]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: k_batch (>0)\n");
exit(0);
}
return !run_grouped_gemm(problem_size, config);
}

View File

@@ -20,6 +20,11 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
using ::ck::DeviceMem;
using ::ck::hip_check_error;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -296,6 +301,32 @@ int main(int argc, char* argv[])
problem_size.group_count = 16;
if(argc == 5)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.k_batch = std::stoi(argv[4]);
}
else if(argc == 6)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.k_batch = std::stoi(argv[4]);
problem_size.group_count = std::stoi(argv[5]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: k_batch (> 0)\n");
printf("arg5: group count (default=16)");
exit(0);
}
for(int i = 0; i < problem_size.group_count; i++)
{
problem_size.Ms.push_back(128 + rand() % 128);
@@ -307,21 +338,5 @@ int main(int argc, char* argv[])
problem_size.stride_Cs.push_back(problem_size.Ns[i]);
}
if(argc == 5)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.k_batch = std::stoi(argv[4]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: k_batch (> 0)\n");
exit(0);
}
return !run_grouped_gemm(problem_size, config);
}

View File

@@ -20,6 +20,11 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
using ::ck::DeviceMem;
using ::ck::hip_check_error;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -297,6 +302,31 @@ int main(int argc, char* argv[])
problem_size.group_count = 16;
if(argc == 5)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.k_batch = std::stoi(argv[4]);
}
else if(argc == 6)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.k_batch = std::stoi(argv[4]);
problem_size.group_count = std::stoi(argv[5]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: k_batch (> 0)\n");
printf("arg5: group count (default=16)");
exit(0);
}
for(int i = 0; i < problem_size.group_count; i++)
{
problem_size.Ms.push_back(256 + 256 * i);
@@ -308,21 +338,5 @@ int main(int argc, char* argv[])
problem_size.stride_Cs.push_back(problem_size.Ns[i]);
}
if(argc == 5)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.k_batch = std::stoi(argv[4]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: k_batch (> 0)\n");
exit(0);
}
return !run_grouped_gemm(problem_size, config);
}

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -19,6 +19,11 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
using ::ck::DeviceMem;
using ::ck::hip_check_error;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -54,7 +59,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
#include "run_grouped_gemm_example.inc"

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -19,6 +19,11 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
using ::ck::DeviceMem;
using ::ck::hip_check_error;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -54,9 +59,16 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 2>;
// clang-format on
#include "run_grouped_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); }
// Entry point of the example.
// Returns 0 without running anything when ck::is_gfx11_supported() or
// ck::is_gfx12_supported() reports true; otherwise forwards the command line to
// run_grouped_gemm_example() and returns the logical negation of its result.
// NOTE(review): presumably the early exit exists because this instance is not
// usable on gfx11/gfx12 devices — confirm; the skip is reported as success.
int main(int argc, char* argv[])
{
if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
{
return 0;
}
return !run_grouped_gemm_example(argc, argv);
}

View File

@@ -0,0 +1,71 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
// Marks this example as providing its own ComputeDataType alias (declared further down).
// NOTE(review): presumably consumed by an #ifndef guard in run_grouped_gemm_example.inc
// that would otherwise default ComputeDataType — confirm against that file.
#define EXAMPLE_WITH_COMPUTE_DATATYPE
using ::ck::DeviceMem;
using ::ck::hip_check_error;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// Element types used by this example: A, B, the accumulator, the C-shuffle buffer
// and E are all F32 (float); DsDataType is an empty ck::Tuple.
using ADataType = F32;
using BDataType = F32;
using AccDataType = F32;
using CShuffleDataType = F32;
using DsDataType = ck::Tuple<>;
using EDataType = F32;
// Passed as the final template argument of DeviceGemmInstance in this file.
// NOTE(review): presumably selects TF32 arithmetic for the FP32 operands — confirm
// against DeviceGroupedGemm_Xdl.
using ComputeDataType = ck::tf32_t;
using ALayout = Row;
using BLayout = Col;
using DsLayout = ck::Tuple<>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
// clang-format off
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4, ck::LoopScheduler::Default, ComputeDataType>;
// clang-format on
#include "run_grouped_gemm_example.inc"
// Entry point: forwards the command line to run_grouped_gemm_example() and returns
// the logical negation of its result as the process exit code.
int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); }
#undef EXAMPLE_WITH_COMPUTE_DATATYPE

View File

@@ -19,6 +19,11 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
using ::ck::DeviceMem;
using ::ck::hip_check_error;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -19,6 +19,11 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
using ::ck::DeviceMem;
using ::ck::hip_check_error;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -51,7 +56,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
#include "run_grouped_gemm_example.inc"

View File

@@ -19,6 +19,11 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
using ::ck::DeviceMem;
using ::ck::hip_check_error;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -66,6 +71,28 @@ int main(int argc, char* argv[])
problem_size.group_count = 16;
if(argc == 4)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
else if(argc == 5)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
problem_size.group_count = std::stoi(argv[4]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: group count (default=16)");
exit(0);
}
for(int i = 0; i < problem_size.group_count; i++)
{
problem_size.Ms.push_back(256 + 256 * i);
@@ -77,19 +104,5 @@ int main(int argc, char* argv[])
problem_size.stride_Cs.push_back(problem_size.Ns[i]);
}
if(argc == 4)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
exit(0);
}
return !run_grouped_gemm(problem_size, config);
}

View File

@@ -3,6 +3,11 @@
#pragma once
// Default ComputeDataType to AccDataType unless the including translation unit has
// defined EXAMPLE_WITH_COMPUTE_DATATYPE, in which case that file is expected to
// declare its own ComputeDataType alias before including this one.
// A macro is used so that existing examples need no changes.
#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
using ComputeDataType = AccDataType;
#endif
struct ProblemSize final
{
std::vector<ck::index_t> Ms;
@@ -231,7 +236,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
AccDataType,
AElementOp,
BElementOp,
CDEElementOp>;
CDEElementOp,
ComputeDataType>;
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
@@ -253,7 +259,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
pass &= ck::utils::check_err(c_device_result_converted, c_host_tensors[i]);
#else
pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]);
pass &= ck::utils::check_err<decltype(c_device_tensors[i]),
decltype(c_host_tensors[i]),
ComputeDataType>(c_device_tensors[i], c_host_tensors[i]);
#endif
}
}
@@ -278,28 +286,20 @@ bool run_grouped_gemm_example(int argc, char* argv[])
problem_size.group_count = 16;
for(int i = 0; i < problem_size.group_count; i++)
if(argc == 1)
{
problem_size.Ms.push_back(256 + 256 * i);
problem_size.Ns.push_back(128 + 128 * i);
problem_size.Ks.push_back(128 + 64 * i);
problem_size.stride_As.push_back(problem_size.Ks[i]);
problem_size.stride_Bs.push_back(problem_size.Ks[i]);
problem_size.stride_Cs.push_back(problem_size.Ns[i]);
// use default cases
}
if(argc == 4)
else if(argc == 4 || argc == 6)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
else if(argc == 5)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.async_hargs = std::stoi(argv[4]);
if(argc == 6)
{
config.async_hargs = std::stoi(argv[4]);
problem_size.group_count = std::stoi(argv[5]);
}
}
else
{
@@ -307,7 +307,34 @@ bool run_grouped_gemm_example(int argc, char* argv[])
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: async hargs (0=n0, 1=yes)\n");
exit(0);
printf("arg5: group count (default=16)");
exit(1);
}
// Lambda that picks a stride value from a layout tag and the two matrix dimensions:
// a ck::tensor_layout::gemm::RowMajor tag yields col_dim, any other tag yields row_dim.
// Only the type of `layout` is inspected (through decltype); its value is never read.
// The branch is resolved at compile time by `if constexpr`.
auto get_stride = [](auto layout, auto row_dim, auto col_dim) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return col_dim;
}
else
{
return row_dim;
}
};
for(int i = 0; i < problem_size.group_count; i++)
{
problem_size.Ms.push_back(256 + 256 * i);
problem_size.Ns.push_back(128 + 128 * i);
problem_size.Ks.push_back(128 + 64 * i);
problem_size.stride_As.push_back(
get_stride(ALayout{}, problem_size.Ms[i], problem_size.Ks[i]));
problem_size.stride_Bs.push_back(
get_stride(BLayout{}, problem_size.Ks[i], problem_size.Ns[i]));
problem_size.stride_Cs.push_back(
get_stride(ELayout{}, problem_size.Ms[i], problem_size.Ns[i]));
}
return run_grouped_gemm(problem_size, config);

View File

@@ -0,0 +1,56 @@
# GEMM with Multiple D and Multiple Reductions
## Theory
This example demonstrates **GEMM with multiple auxiliary tensors (D) and multiple reduction operations**. This pattern is used in advanced neural network layers that require additional outputs or statistics (such as sums, means, or other reductions) alongside the main GEMM result.
**Mathematical Formulation:**
- For each GEMM: $C = A \times B$
- Auxiliary tensors: $D_0, D_1, ...$ (various shapes)
- Reductions: e.g., sum, mean, max over specified axes or outputs
The kernel computes the main GEMM output and additional reductions or statistics in a single pass.
**Algorithmic Background:**
- The GEMM result is kept in registers, auxiliary tensors are fused in the epilogue, and reductions are computed as part of the output.
- Useful for multi-task learning, attention statistics, and custom neural network layers.
## How to Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build and run
```bash
cd composable_kernel/example/16_gemm_multi_d_multi_reduces
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
make -j
# Example run
./gemm_multi_d_multi_reduces_xdl --verify=1 --time=1
```
## Source Code Structure
### Directory Layout
```
example/16_gemm_multi_d_multi_reduces/
└── gemm_multi_d_multi_reduces_xdl.cpp # Main example: sets up, runs, and verifies GEMM with multi-D/multi-reduce
include/ck/tensor_operation/gpu/device/
└── device_gemm_multi_d_multi_reduces.hpp # Device-level API for multi-D/multi-reduce GEMM
include/ck/tensor_operation/gpu/device/impl/
└── device_gemm_multi_d_multi_reduces_impl.hpp # Implementation
include/ck/tensor_operation/gpu/grid/
└── gridwise_gemm_multi_d_multi_reduces.hpp # Grid-level kernel
```
### Key Classes and Functions
- **DeviceGemmMultiDMultiReduces** (in `device_gemm_multi_d_multi_reduces.hpp`):
Device API for GEMM with multiple outputs and reductions.
- **gridwise_gemm_multi_d_multi_reduces** (in `gridwise_gemm_multi_d_multi_reduces.hpp`):
Implements the tiled (blocked) GEMM kernel with a multi-output/multi-reduction epilogue.
This example demonstrates how Composable Kernel supports advanced GEMM patterns with multiple outputs and reductions in a single efficient kernel.

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -19,6 +19,10 @@
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -76,7 +80,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
//######| | | | Type| Type| Type| DataType| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
//######| | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _MPerBlock_NPerBlock| ScalarPerVector| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NPerBlock| |
< ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>;
< ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<32, 8>, 4, 1>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
@@ -8,6 +8,10 @@
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
// DataType
using ADataType = INT8;
using BDataType = INT8;
@@ -72,10 +76,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
64, // KPerBlock
16, // AK1
16, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
8, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
@@ -92,7 +96,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
S<32, 8>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
@@ -7,6 +7,10 @@
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
// DataType
using ADataType = BF16;
using BDataType = BF16;
@@ -65,10 +69,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
8, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
@@ -85,7 +89,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
S<32, 8>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
@@ -7,6 +7,10 @@
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
// DataType
using ADataType = F16;
using BDataType = F16;
@@ -65,10 +69,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
8, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
@@ -85,7 +89,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
S<32, 8>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
@@ -7,6 +7,10 @@
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
// DataType
using ADataType = F32;
using BDataType = F32;
@@ -146,6 +150,11 @@ int main(int argc, char* argv[])
exit(0);
}
if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
{
return 0;
}
return run_gemm_reduce_max_xdl<ADataType,
BDataType,
EDataType,

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
@@ -7,6 +7,10 @@
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
using ADataType = INT4;
using ADataKernelType = INT8;
using BDataType = INT4;

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
@@ -7,6 +7,10 @@
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
using ADataType = INT8;
using BDataType = INT8;
using GemmAccDataType = INT32;
@@ -64,10 +68,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
64, // KPerBlock
16, // AK1
16, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
8, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
@@ -84,7 +88,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
S<32, 8>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
@@ -7,6 +7,10 @@
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
// DataType
using ADataType = BF16;
using BDataType = BF16;
@@ -72,10 +76,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
8, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
@@ -92,7 +96,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
S<32, 8>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
@@ -7,6 +7,10 @@
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
// DataType
using ADataType = F16;
using BDataType = F16;
@@ -72,10 +76,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
16, // MPerXdl
16, // NPerXdl
8, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
@@ -92,7 +96,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultip
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
S<32, 8>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
@@ -7,6 +7,10 @@
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
// DataType
using ADataType = F32;
using BDataType = F32;
@@ -153,6 +157,11 @@ int main(int argc, char* argv[])
exit(EXIT_SUCCESS);
}
if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
{
exit(EXIT_SUCCESS);
}
return !run_gemm_reduce_mean_meansquare_xdl<ADataType,
BDataType,
EDataType,

View File

@@ -18,6 +18,10 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

View File

@@ -1,6 +1,62 @@
# Instructions for ```example_convnd_bwd_data_xdl```
# N-Dimensional Convolution Backward Pass for Data
## Run ```example_example_convnd_bwd_data_xdl```
This example demonstrates the backward data pass of an N-dimensional convolution, often denoted as `conv_bwd_data`. This operation is a crucial part of the backpropagation algorithm for training Convolutional Neural Networks (CNNs). Its purpose is to compute the gradient of the loss function with respect to the convolution's *input data*, which is then passed back to the preceding layer in the network.
## Mathematical Formulation
The backward data pass computes the gradient $\frac{\partial L}{\partial \text{In}}$, given the gradient from the subsequent layer, $\frac{\partial L}{\partial \text{Out}}$, and the filter weights `W` used in the forward pass.
Let the forward convolution be defined as:
$\text{Out} = \text{In} \star W$
The backward data pass is mathematically equivalent to a "full" convolution between the output gradient tensor `dL/dOut` and the 180-degree rotated (or transposed and flipped) weight tensor `W`.
$\frac{\partial L}{\partial \text{In}} = \frac{\partial L}{\partial \text{Out}} \star \text{rot180}(W)$
This operation propagates the error signal from the output back to the input, weighted by the same filters that were used in the forward pass.
## Algorithmic Strategy: Implicit GEMM
As with the forward pass, the most efficient way to implement the backward data pass on a GPU is to transform the convolution into a General Matrix-Matrix Multiplication (GEMM) problem.
1. **Output Gradient Reshaping**: The output gradient tensor `dL/dOut` is logically reshaped into a matrix `dL/dOut'` of shape `[K, (N*Ho*Wo)]`. This becomes the "A" matrix in the GEMM.
2. **Weight Reshaping**: The weight tensor `W` is logically reshaped into a matrix `W'` of shape `[K, (C*Y*X)]`. This becomes the "B" matrix in the GEMM.
3. **Implicit GEMM**: The core computation is then formulated as a GEMM operation. However, the output of this GEMM is not a simple matrix; it's the `dL/dIn` tensor.
$(\text{dL/dIn})' = (W')^T \times (\text{dL/dOut})'$
The key insight is that this operation can be performed without explicitly forming the matrices. The GEMM kernel is designed to read from `dL/dOut` and `W` and write its results directly to the appropriate locations in the `dL/dIn` tensor. This process is sometimes referred to as an "implicit `col2im`" (column-to-image), as it is the inverse of the `im2col` transformation used in the forward pass.
This "implicit GEMM" approach is highly efficient. It avoids the massive memory and bandwidth overhead of materializing intermediate matrices, which is critical for performance.
## Source Code Organization
- [`conv_bwd_data_xdl.cpp`](./conv_bwd_data_xdl.cpp): The main example file that defines the parameters for a 2D convolution and instantiates the generic `DeviceConvNdBwdData` kernel to compute the input gradients.
- [`../../include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp`](../../include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp): The high-level device interface for the backward data convolution. It is templated on the dimensionality, layouts, and data types of the problem.
- [`../../include/ck/tensor_operation/gpu/grid/gridwise_gemm_implicit_gemm_v1r2_xdlops_nchw_kcyx_nkhw.hpp`](../../include/ck/tensor_operation/gpu/grid/gridwise_gemm_implicit_gemm_v1r2_xdlops_nchw_kcyx_nkhw.hpp): An example of a specific grid-wise kernel that implements the implicit GEMM algorithm for the backward data pass. The library contains multiple such kernels optimized for different layouts and problem types, and the `DeviceConvNdBwdData` interface selects the most appropriate one.
- [`../../library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp`](../../library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp): A CPU reference implementation used to verify the correctness of the GPU kernel's output.
## Build and Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build the Example
```bash
cd /path/to/composable_kernel/example/17_convnd_bwd_data
mkdir build && cd build
cmake \
-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
..
make -j
```
### Run the Example
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
@@ -45,3 +101,16 @@ Warm up
Start running 1 times...
Perf: 1.40031 ms, 69.8734 TFlops, 179.037 GB/s
```
## Relationship to Other Passes
The training of a single convolutional layer requires three distinct steps:
1. **Forward Pass (`conv_fwd`)**: Computes the output feature maps.
- `Out = In * W`
2. **Backward Data Pass (`conv_bwd_data`)**: Computes the gradient with respect to the input, propagating the error to the previous layer. This is the focus of the current example.
- `dL/dIn = dL/dOut * rot180(W)`
3. **Backward Weight Pass (`conv_bwd_weight`)**: Computes the gradient with respect to the weights, which is needed for the weight update.
- `dL/dW = In * dL/dOut`
All three passes are critical for training a CNN, and all are typically implemented as high-performance implicit GEMM operations.

View File

@@ -18,6 +18,10 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
void print_helper_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"

View File

@@ -1,8 +1 @@
list(APPEND gpu_list gfx908 gfx90a gfx942 gfx950)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp)
set(target 1)
endif()
endforeach()
add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp)

View File

@@ -0,0 +1,78 @@
# Batched GEMM with Reduction
This example demonstrates a Batched General Matrix-Matrix Multiplication (Batched GEMM) where the result of each individual GEMM in the batch is then reduced along one of its dimensions. This is a specialized fusion pattern that combines a compute-intensive operation (GEMM) with a memory-intensive one (reduction), offering significant performance benefits for specific workloads.
## Mathematical Formulation
The operation performs a standard GEMM for each item in a batch, and then reduces the resulting matrix to a vector. For each batch item `b` from `0` to `BatchCount-1`:
1. **GEMM Stage**: A standard matrix multiplication is performed.
$C_{[b]} = A_{[b]} \times B_{[b]}$
2. **Reduction Stage**: The resulting matrix $C_{[b]}$ is reduced along one of its dimensions (e.g., the M dimension) to produce an output vector $D_{[b]}$.
$D_{[b], j} = \bigoplus_{i=0}^{M-1} C_{[b], i, j}$
Where:
- $A_{[b]}$ is an $M \times K$ matrix.
- $B_{[b]}$ is a $K \times N$ matrix.
- $C_{[b]}$ is the intermediate $M \times N$ result matrix for batch `b`.
- $D_{[b]}$ is the final $1 \times N$ output vector for batch `b`.
- $\bigoplus$ is a binary, associative reduction operator like sum, max, or min.
The key optimization is that the intermediate matrix $C_{[b]}$ is never written to global memory. The reduction is fused directly into the GEMM kernel.
## Algorithmic Strategy: Fused GEMM and Reduction
The implementation fuses the reduction into the epilogue of a batched GEMM kernel. The batch dimension provides a natural axis for parallelism.
1. **Batch Scheduling**: The `BatchCount` GEMM problems are distributed across the GPU's thread blocks. Each block is assigned one or more GEMMs from the batch to compute.
2. **Tiled GEMM Core**: For each assigned GEMM, the thread block runs a standard tiled GEMM algorithm to compute the product $A_{[b]} \times B_{[b]}$. The result for each tile of $C_{[b]}$ is accumulated in the private registers of the threads.
3. **Fused Reduction Epilogue**: This is where the fusion occurs. Instead of writing the computed tile of $C_{[b]}$ to global memory, the threads use it as input for a parallel reduction.
- **Intra-Block Reduction**: The threads within a block, which collectively hold the values for a tile of $C_{[b]}$, perform a local reduction. For example, to reduce along the M dimension, threads responsible for different M-rows but the same N-column will cooperate, using fast shared memory to sum their partial results.
- **Inter-Block Reduction**: Since multiple thread blocks may be working on different M-tiles for the same batch item, their partial reduction results must be combined. Each block writes its partial sum to a designated location in the output vector `D`, using atomic operations (like `atomicAdd`) to safely accumulate the final result.
This strategy completely eliminates the global memory traffic associated with the intermediate matrix `C`, which is often the largest tensor in the operation. This leads to substantial savings in memory bandwidth and improved performance.
## Source Code Organization
- [`batched_gemm_reduce_xdl_fp16.cpp`](./batched_gemm_reduce_xdl_fp16.cpp): The main example file. It sets up the batched GEMM problem and instantiates the `DeviceBatchedGemmReduce` operation, specifying the reduction dimension and operator.
- [`../../include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce.hpp`](../../include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce.hpp): The high-level device interface for this fused operation.
- [`../../include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_reduce_xdl_cshuffle.hpp`](../../include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_reduce_xdl_cshuffle.hpp): The grid-wise kernel that implements the fused logic. It handles the batch scheduling, the tiled GEMM, and the fused reduction epilogue with atomic operations for inter-block communication.
## Build and Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build the Example
```bash
cd /path/to/composable_kernel/example/18_batched_gemm_reduce
mkdir build && cd build
cmake \
-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
..
make -j
```
### Run the Example
```bash
# Run the example with default settings
./example_batched_gemm_reduce_xdl_fp16
# Run with verification, data initialization, and timing
./example_batched_gemm_reduce_xdl_fp16 1 2 1
```
## Applications
This fused pattern is less common than simple GEMM+Bias but is highly effective for specific algorithms.
- **Gradient Computations**: In some complex neural network layers, the gradient calculation might involve a matrix product followed by a summation. For example, computing the gradient with respect to a bias term often involves summing the output gradients over the batch and spatial dimensions. If the output gradient itself is the result of a GEMM, this fused kernel could be applicable.
- **Custom Attention Mechanisms**: While standard attention involves a `softmax`, some research explores attention-like mechanisms that might use a simple sum or max reduction instead. If the query-key interaction is formulated as a batched GEMM, this kernel could compute the attention weights in a single, fused step.
- **Scientific Computing**: Certain numerical methods, particularly in physics or signal processing, may involve performing a linear transform (GEMM) on a set of signals (a batch) and then integrating the result (a reduction).

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -19,14 +19,19 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using ADataType = F16;
using BDataType = F16;
@@ -64,7 +69,7 @@ using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatc
//######| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, AElementOp, BElementOp, CElementOp, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceGlobalMemOps, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, AElementOp, BElementOp, CElementOp, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceGlobalMemOps, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4, S<32, 8>, 4, 1>;
// clang-format on
using ReferenceBatchedGemmInstance =
@@ -137,11 +142,13 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz});
return HostTensorDescriptor(
{batch_count, row, col}, {row * stride, stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride});
return HostTensorDescriptor(
{batch_count, row, col}, {col * stride, 1_uz, stride}, Bypass{});
}
};

View File

@@ -0,0 +1,84 @@
# Binary Elementwise Operations with Broadcasting
This example demonstrates a generic binary elementwise operation, a fundamental building block in numerical computing. It covers two important cases:
1. **Simple Elementwise**: Applying a binary function to two input tensors of the *same* shape.
2. **Elementwise with Broadcasting**: Applying a binary function to two input tensors of *different but compatible* shapes.
Broadcasting defines a set of rules for applying elementwise operations on tensors of different sizes, and it is a cornerstone of libraries like NumPy and TensorFlow.
## Mathematical Formulation
### Simple Elementwise
Given two input tensors, A and B, of the same rank and dimensions, and a binary operator $\odot$, the operation computes an output tensor C where each element is:
$C_{i,j,k,\dots} = A_{i,j,k,\dots} \odot B_{i,j,k,\dots}$
### Elementwise with Broadcasting
Broadcasting allows elementwise operations on tensors with different shapes, provided they are compatible. Two dimensions are compatible if they are equal, or if one of them is 1. The operation implicitly "stretches" or "duplicates" the tensor with the dimension of size 1 to match the other tensor's shape.
For example, adding a bias vector `B` of shape `(1, N)` to a matrix `A` of shape `(M, N)`:
$C_{i,j} = A_{i,j} + B_{0,j}$
Here, the single row of `B` is broadcast across all `M` rows of `A`. The output tensor `C` has the shape `(M, N)`.
Common binary elementwise operations include addition, subtraction, multiplication (Hadamard product), division, max, and min.
## Algorithmic Strategy: Grid-Stride Loop with Broadcasting
The implementation for both cases relies on the efficient **grid-stride loop**, which is adapted to handle broadcasting.
1. **Grid Partitioning**: The problem is mapped to a 1D grid of threads based on the number of elements in the **output** tensor.
2. **Grid-Stride Loop**: Each thread iterates through a subset of the output elements. For each output index, it must calculate the corresponding indices into the input tensors A and B.
3. **Broadcasting Logic**:
- The core of the broadcasting logic lies in the `get_broadcast_coord` function. If an input tensor's dimension is 1, the coordinate for that dimension is always set to 0, effectively reusing the same element across the broadcast dimension. If the dimension matches the output, the coordinate is passed through.
- This strategy ensures that memory accesses to the larger tensor remain coalesced, while accesses to the smaller, broadcasted tensor will naturally involve re-reading the same values, which is efficiently handled by the GPU's cache hierarchy.
Like the simple case, broadcasted elementwise operations are almost always memory-bandwidth-bound.
## Source Code Organization
This example contains multiple files to demonstrate different scenarios:
- [`binary_elementwise_xdl.cpp`](./binary_elementwise_xdl.cpp): Demonstrates the simple case where both input tensors have the same shape.
- [`broadcast_add_2d_amn_bn.cpp`](./broadcast_add_2d_amn_bn.cpp): A specific example of broadcasting, adding a 1D tensor `B` of shape `(N)` to a 2D tensor `A` of shape `(M, N)`, so that `B` is broadcast across all `M` rows of `A`.
- [`../../include/ck/tensor_operation/gpu/device/device_elementwise.hpp`](../../include/ck/tensor_operation/gpu/device/device_elementwise.hpp): The high-level device interface. It is generic enough to handle both simple and broadcasted operations by correctly interpreting the tensor descriptors, which contain shape and stride information.
- [`../../include/ck/tensor_operation/gpu/grid/gridwise_elementwise.hpp`](../../include/ck/tensor_operation/gpu/grid/gridwise_elementwise.hpp): The grid-wise kernel that implements the grid-stride loop. The tensor coordinate logic within this kernel correctly handles broadcasting based on the provided tensor descriptors.
- [`../../include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp`](../../include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp): Defines the various binary operator functors (like `Add`, `Multiply`, etc.).
## Build and Run
### Prerequisites
Please follow the instructions in the main [Build Guide](../../README.md#building-ck) section as a prerequisite to building and running this example.
### Build the Example
```bash
cd /path/to/composable_kernel/example/19_binary_elementwise
mkdir build && cd build
cmake \
-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-DCMAKE_PREFIX_PATH="/opt/rocm;${CK_INSTALL_PATH}" \
..
make -j
```
### Run the Example
```bash
# Run the simple elementwise example
./binary_elementwise_xdl 1 2 1
# Run the broadcasting example
./broadcast_add_2d_amn_bn 1 2 1
```
## Applications
Broadcasting is a powerful feature that makes code more concise and memory-efficient.
- **Adding Bias**: The most common use case in deep learning is adding a bias vector (shape `[N]`) to a matrix of activations (shape `[Batch, N]`).
- **Feature Scaling**: Multiplying a feature map (shape `[N, C, H, W]`) by a per-channel scaling factor (shape `[1, C, 1, 1]`).
- **Standardization**: In data preprocessing, subtracting the mean (a vector) and dividing by the standard deviation (another vector) from a data matrix.
- **Coordinate Grids**: Creating coordinate grids by adding a row vector `[0, 1, 2...]` to a column vector `[0, 1, 2...]^T`.

View File

@@ -14,6 +14,10 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
using ::ck::Tensor;
using F16 = ck::half_t;
using F32 = float;

Some files were not shown because too many files have changed in this diff Show More