Merge branch 'develop' into ck_tile/gemm_blockscale_eightwarps

This commit is contained in:
Yi DING
2026-01-28 13:22:29 +08:00
committed by GitHub
249 changed files with 18528 additions and 25736 deletions

View File

@@ -25,6 +25,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
## Composable Kernel 1.2.0 for ROCm 7.2.0
### Added
* Added tests for f8 x bf8 on CompV3, and f8 x bf8 with K_BlockSize 32 on CompV4
* Added CK-Tile dispatcher - a unified kernel dispatch, code generation and architecture-based kernel filtering system with with C++ and Python frontends starting with GEMM support.
* Added support for bf16 data type to grouped_gemm and grouped_gemm_preshuffle.
* Added Col-Col-Row-Col layout support for aquant mode in blockscale GEMM.

View File

@@ -259,6 +259,11 @@ if ((SUPPORTED_GPU_TARGETS MATCHES "gfx94" OR SUPPORTED_GPU_TARGETS MATCHES "gfx
add_definitions(-DCK_USE_GFX94)
set(CK_USE_GFX94 "ON")
endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx950" AND NOT FORCE_DISABLE_XDL)
message(STATUS "Enabling XDL FP8 gemms on gfx950")
add_definitions(-DCK_USE_GFX950)
set(CK_USE_GFX950 "ON")
endif()
# new macro CK_TILE_USE_WMMA in order to separately compile examples for MFMA/WMMA
set(CK_TILE_USE_WMMA 0)

101
Dockerfile.manylinux Normal file
View File

@@ -0,0 +1,101 @@
FROM ghcr.io/rocm/therock_build_manylinux_x86_64:latest
ARG DEBIAN_FRONTEND=noninteractive
ARG ROCMVERSION=7.2
ARG compiler_version=""
ARG compiler_commit=""
ARG CK_SCCACHE=""
ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
ENV DEBIAN_FRONTEND=noninteractive
USER root
# Add rocm repository
RUN dnf clean all && dnf update -y && dnf -v install wget gnupg2 curl -y
RUN wget https://repo.radeon.com/amdgpu-install/7.2/rhel/8.10/amdgpu-install-7.2.70200-1.el8.noarch.rpm && \
dnf install ./amdgpu-install-7.2.70200-1.el8.noarch.rpm -y && \
dnf update -y && \
dnf install python3-setuptools python3-wheel -y && \
dnf install rocm-dev -y
## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
ENV SCCACHE_INSTALL_LOCATION=/usr/local/.cargo/bin
ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION}
ENV CK_SCCACHE=$CK_SCCACHE
RUN if [ "$CK_SCCACHE" != "" ]; then \
mkdir -p ${SCCACHE_INSTALL_LOCATION} && \
curl ${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \
chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache; \
fi
# Install dependencies
RUN dnf update -y && DEBIAN_FRONTEND=noninteractive dnf install -y \
cmake \
clang-tools-extra \
gcc-c++ \
libstdc++ \
libstdc++-devel \
libstdc++-static \
git \
hip-rocclr \
jq \
mpich \
net-tools \
pkg-config \
redis \
sshpass \
stunnel \
vim \
nano \
zip \
openssh-server \
kmod && \
dnf clean all && \
rm -rf /var/lib/apt/lists/* && \
rm -rf amdgpu-install* && \
#Install latest ccache
git clone https://github.com/ccache/ccache.git && \
cd ccache && mkdir build && cd build && cmake .. && make install && \
#Install ClangBuildAnalyzer
git clone https://github.com/aras-p/ClangBuildAnalyzer.git && \
cd ClangBuildAnalyzer/ && \
make -f projects/make/Makefile && \
cd / && \
#Install latest cppcheck
git clone https://github.com/danmar/cppcheck.git && \
cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . && \
cd / && \
# Install packages for processing the performance results
pip3 install --break-system-packages --upgrade pytest pymysql pandas==2.2.3 sqlalchemy==2.0.3 setuptools-rust setuptools sshtunnel==0.4.0 && \
# Add render group
groupadd -f render && \
# Install the new rocm-cmake version
git clone -b master https://github.com/ROCm/rocm-cmake.git && \
cd rocm-cmake && mkdir build && cd build && \
cmake .. && cmake --build . && cmake --build . --target install
WORKDIR /
# Add alternative compilers, if necessary
ENV compiler_version=$compiler_version
ENV compiler_commit=$compiler_commit
RUN sh -c "echo compiler version = '$compiler_version'" && \
sh -c "echo compiler commit = '$compiler_commit'"
RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \
git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
cd llvm-project && mkdir build && cd build && \
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
make -j 8 ; \
else echo "using the release compiler"; \
fi
RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \
git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
make -j 8 ; \
else echo "using the release compiler"; \
fi

4
Jenkinsfile vendored
View File

@@ -39,10 +39,10 @@ def sendFailureNotifications() {
// Error patterns to scan build logs for specific failure types and send detailed notifications.
def failurePatterns = [
[pattern: /login attempt to .* failed with status: 401 Unauthorized/, description: "Docker registry authentication failed"],
[pattern: /docker login failed/, description: "Docker login failed"],
[pattern: /(.*)docker login failed(.*)/, description: "Docker login failed"],
[pattern: /HTTP request sent .* 404 Not Found/, description: "HTTP request failed with 404"],
[pattern: /cat: .* No such file or directory/, description: "GPU not found"],
[pattern: /GPU not found/, description: "GPU not found"],
[pattern: /(.*)GPU not found(.*)/, description: "GPU not found"],
[pattern: /Could not connect to Redis at .* Connection timed out/, description: "Redis connection timed out"]
]

View File

@@ -1,14 +1,13 @@
.. _ck_tile_index:
************************
CK Tile Index
************************
CK Tile documentation structure:
****************************************************
CK Tile conceptual documentation table of contents
****************************************************
.. toctree::
:maxdepth: 2
index
introduction_motivation
buffer_views
tensor_views

View File

@@ -1,156 +0,0 @@
# Mermaid Diagram Management
This document explains how to manage mermaid diagrams in the CK Tile documentation.
## Overview
All mermaid diagrams in the CK Tile documentation have been converted to SVG files for better rendering compatibility. The original mermaid source code is preserved as commented blocks in the RST files, allowing easy updates when needed.
## Directory Structure
- `docs/conceptual/ck_tile/diagrams/` - Contains all SVG diagram files
- `docs/conceptual/ck_tile/convert_mermaid_to_svg.py` - Initial conversion script (one-time use)
- `docs/conceptual/ck_tile/update_diagrams.py` - Helper script to regenerate diagrams from comments
## Diagram Format in RST Files
Each diagram follows this format:
```rst
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
A --> B
B --> C
.. image:: diagrams/diagram_name.svg
:alt: Diagram
:align: center
```
The commented mermaid block won't appear in the rendered documentation but serves as the source for regenerating the SVG.
## Updating Diagrams
### When to Update
You need to regenerate SVG files when:
- Modifying the mermaid source in a commented block
- Adding new diagrams
- Updating diagram styling
### How to Update
1. **Edit the commented mermaid source** in the RST file
2. **Run the update script**:
```bash
# Update all diagrams
python docs/conceptual/ck_tile/update_diagrams.py
# Update diagrams in a specific file
python docs/conceptual/ck_tile/update_diagrams.py transforms.rst
# Force regenerate all diagrams (even if SVGs exist)
python docs/conceptual/ck_tile/update_diagrams.py --force
```
### Prerequisites
The update script requires [mermaid-cli](https://github.com/mermaid-js/mermaid-cli):
```bash
npm install -g @mermaid-js/mermaid-cli
```
## Adding New Diagrams
To add a new mermaid diagram:
1. **Create the commented block** in your RST file:
```rst
..
Original mermaid diagram (edit here, then run update_diagrams.py)
.. mermaid::
graph TB
A --> B
```
2. **Add the image reference** immediately after:
```rst
.. image:: diagrams/my_new_diagram.svg
:alt: My New Diagram
:align: center
```
3. **Generate the SVG**:
```bash
python docs/conceptual/ck_tile/update_diagrams.py your_file.rst
```
## Current Diagrams
The following RST files contain mermaid diagrams (40 total):
- `adaptors.rst` (2 diagrams)
- `convolution_example.rst` (1 diagram)
- `coordinate_movement.rst` (1 diagram)
- `descriptors.rst` (2 diagrams)
- `encoding_internals.rst` (2 diagrams)
- `lds_index_swapping.rst` (3 diagrams)
- `load_store_traits.rst` (2 diagrams)
- `space_filling_curve.rst` (1 diagram)
- `static_distributed_tensor.rst` (1 diagram)
- `sweep_tile.rst` (4 diagrams)
- `tensor_coordinates.rst` (2 diagrams)
- `thread_mapping.rst` (2 diagrams)
- `tile_window.rst` (5 diagrams)
- `transforms.rst` (12 diagrams)
## Troubleshooting
### SVG not generated
- Check that mermaid-cli is installed: `mmdc --version`
- Verify the mermaid syntax is valid
- Look for error messages in the script output
### Diagram not updating
- Use `--force` flag to regenerate: `python docs/update_diagrams.py --force`
- Check that the image reference matches the generated filename
### Pattern not matching
If the update script can't find your commented diagram:
- Ensure proper indentation (3 spaces for comment block content)
- Verify the `.. mermaid::` directive is commented
- Check that the image reference immediately follows the comment block
## Script Details
### update_diagrams.py
This script:
1. Scans RST files for commented mermaid blocks
2. Extracts the mermaid source code
3. Converts to SVG using `mmdc`
4. Saves to the diagrams directory
**Usage:**
- `python docs/conceptual/ck_tile/update_diagrams.py` - Check all files, update missing SVGs
- `python docs/conceptual/ck_tile/update_diagrams.py --force` - Regenerate all SVGs
- `python docs/conceptual/ck_tile/update_diagrams.py <file.rst>` - Update specific file
### convert_mermaid_to_svg.py
This was the initial conversion script. It:
1. Found all active `.. mermaid::` directives
2. Converted them to SVGs
3. Replaced directives with commented source + image references
This script was used once for the initial conversion and typically doesn't need to be run again.

View File

@@ -59,8 +59,8 @@ A TensorAdaptor encapsulates a sequence of :ref:`coordinate transformations <ck_
.. image:: diagrams/adaptors_1.svg
:alt: Diagram
:align: center
Core Components
Core Components
~~~~~~~~~~~~~~~
Each TensorAdaptor contains:
@@ -115,7 +115,7 @@ Custom adaptors can be created by specifying which transforms to use and how the
make_tuple(sequence<0>{}) // to single dim 0
);
// The adaptor is embedded in the :ref:`descriptor <ck_tile_descriptors>`
// The adaptor is embedded in the descriptor
// To use it:
multi_index<1> top_coord{5}; // 1D coordinate
// This internally calculates: row = 5/3 = 1, col = 5%3 = 2
@@ -309,7 +309,6 @@ A practical example showing how adaptors create efficient :ref:`GPU memory acces
// - Dimension 0,1: Thread indices
// - Dimension 2,3: Vector indices within thread
// Enables coalesced memory access on GPU
// See :ref:`ck_tile_thread_mapping` for thread mapping details
Common Transform Chains
-----------------------

View File

@@ -1,6 +1,25 @@
.. _ck_tile_buffer_views:
**********************************
Buffer Views - Raw Memory Access
**********************************
Overview
--------
At the foundation of the CK Tile system lies BufferView, a compile-time abstraction that provides structured access to raw memory regions within GPU kernels. This serves as the bridge between the hardware's physical memory model and the higher-level abstractions that enable efficient GPU programming. BufferView encapsulates the complexity of GPU memory hierarchies while exposing a unified interface that works seamlessly across different memory address spaces including global memory shared across the entire device, local data share (LDS) memory shared within a workgroup, or the ultra-fast register files private to each thread.
BufferView serves as the foundation for :ref:`ck_tile_tensor_views`, which add multi-dimensional structure on top of raw memory access. Understanding BufferView is essential before moving on to more complex abstractions like :ref:`ck_tile_distribution` and :ref:`ck_tile_tile_window`.
By providing compile-time knowledge of buffer properties through template metaprogramming, BufferView enables the compiler to generate optimal machine code for each specific use case. This zero-overhead abstraction ensures that the convenience of a high-level interface comes with no runtime performance penalty.
One of BufferView's most important features is its advanced handling of out-of-bounds memory access. Unlike CPU programming where such accesses typically result in segmentation faults or undefined behavior, GPU programming must gracefully handle cases where threads attempt to access memory beyond allocated boundaries. BufferView provides configurable strategies for these scenarios, where developers can choose between returning either numerical zero values or custom sentinel values for invalid accesses. This flexibility is important for algorithms that naturally extend beyond data boundaries, such as convolutions with padding or matrix operations with non-aligned dimensions.
The abstraction extends beyond simple memory access to encompass both scalar and vector data types. GPUs achieve their highest efficiency when loading or storing multiple data elements in a single instruction. BufferView seamlessly supports these vectorized operations, automatically selecting the appropriate hardware instructions based on the data type and access pattern. This capability transforms what would be multiple memory transactions into single, efficient operations that fully utilize the available memory bandwidth.
BufferView also incorporates AMD GPU-specific optimizations that leverage unique hardware features. The AMD buffer addressing mode, for instance, provides hardware-accelerated bounds checking that ensures memory safety without the performance overhead of software-based checks. Similarly, BufferView exposes atomic operations that are crucial for parallel algorithms requiring thread-safe updates to shared data structures. These hardware-specific optimizations are abstracted behind a portable interface, ensuring that code remains maintainable while achieving optimal performance.
Memory coherence and caching policies represent another layer of complexity that BufferView manages transparently. Different GPU memory spaces have different coherence guarantees and caching behaviors. Global memory accesses can be cached in L1 and L2 caches with various coherence protocols, while LDS memory provides workgroup-level coherence with specialized banking structures (see :ref:`ck_tile_lds_bank_conflicts` for details on avoiding bank conflicts). BufferView encapsulates these details, automatically applying the appropriate memory ordering constraints and cache control directives based on the target address space and operation type.
Address Space Usage Patterns
----------------------------
@@ -51,6 +70,7 @@ Address Space Usage Patterns
.. image:: diagrams/buffer_views_1.svg
:alt: Diagram
:align: center
C++ Implementation
------------------

View File

@@ -59,10 +59,6 @@ The key insight is that convolution can be transformed from a complex nested loo
.. image:: diagrams/convolution_example.svg
:alt: Diagram
:align: center
.. image:: diagrams/convolution_example.svg
:alt: Diagram
:align: center
@@ -88,7 +84,6 @@ Non-overlapping tiles:
// Original matrix: shape=(6, 6), strides=(6, 1)
// Tiled view: shape=(3, 3, 2, 2), strides=(12, 2, 6, 1)
// See :ref:`ck_tile_descriptors` for descriptor details
using TileDescriptor = TensorDescriptor<
Sequence<kNumTiles, kNumTiles, kTileSize, kTileSize>,
Sequence<12, 2, 6, 1>
@@ -243,7 +238,6 @@ The im2col transformation converts the 4D windows tensor into a 2D matrix suitab
>;
// Step 2: Apply merge transforms to create 2D im2col layout
// See :ref:`ck_tile_transforms` for transform operations
using Im2colDescriptor = decltype(
transform_tensor_descriptor(
WindowsDescriptor{},
@@ -312,7 +306,6 @@ Combining all components into an optimized convolution implementation:
>;
// Tile distribution for matrix multiplication
// See :ref:`ck_tile_tile_distribution` for details
using ATileDist = TileDistribution<
Sequence<TileM, TileK>,
Sequence<BlockM, 1>
@@ -327,7 +320,6 @@ Combining all components into an optimized convolution implementation:
>;
// Thread-local accumulator
// See :ref:`ck_tile_static_distributed_tensor`
StaticDistributedTensor<DataType, CTileDist> c_accumulator;
// Initialize accumulator
@@ -339,7 +331,6 @@ Combining all components into an optimized convolution implementation:
// Main GEMM loop over K dimension
for (index_t k_tile = 0; k_tile < PatchSize; k_tile += TileK) {
// Create tile windows for im2col matrix and kernel
// See :ref:`ck_tile_tile_window` for window operations
auto a_window = make_tile_window<ATileDist>(
input, Im2colDesc{H, W, K},
{blockIdx.y * TileM, k_tile}
@@ -350,7 +341,7 @@ Combining all components into an optimized convolution implementation:
{k_tile, 0}
);
// Load tiles - see :ref:`ck_tile_load_store_traits` for optimization
// Load tiles
auto a_tile = a_window.load();
auto b_tile = b_window.load();
@@ -476,7 +467,6 @@ CK Tile enables several optimizations for convolution:
__shared__ float smem_b[TileK][TileN];
// Collaborative loading with proper bank conflict avoidance
// See :ref:`ck_tile_lds_bank_conflicts` for optimization
auto load_tile_to_smem = [&](auto& window, float smem[][TileK]) {
#pragma unroll
for (index_t i = threadIdx.y; i < TileM; i += blockDim.y) {
@@ -560,7 +550,7 @@ This example demonstrates how CK Tile transforms convolution from a memory-bound
- **Sliding windows** can be efficiently represented using tensor descriptors with appropriate strides
- **Im2col transformation** converts convolution to matrix multiplication without data copies
- **Tile distribution** enables optimal work distribution across GPU threads (see :ref:`ck_tile_tile_distribution`)
- **Tile distribution** enables optimal work distribution across GPU threads (see :ref:`ck_tile_distribution`)
- **Multi-channel support** extends naturally through higher-dimensional descriptors
- **Performance optimizations** like vectorization and shared memory are seamlessly integrated (see :ref:`ck_tile_gemm_optimization` for similar techniques)

View File

@@ -317,7 +317,7 @@ Movement Through Adaptors
Advanced Movement Patterns
==========================
Real-world applications use advanced movement patterns for optimal memory access. These patterns often relate to :ref:`ck_tile_tile_window` operations and :ref:`ck_tile_tile_distribution` concepts:
Real-world applications use advanced movement patterns for optimal memory access. These patterns often relate to :ref:`ck_tile_tile_window` operations and :ref:`ck_tile_distribution` concepts:
Tiled Access Pattern
--------------------

View File

@@ -315,18 +315,18 @@ Padding for Convolution
.. code-block:: cpp
// Add padding to spatial dimensions
auto padded = transform_tensor_descriptor(
input_tensor,
make_tuple(
make_pass_through_transform(N), // Batch
make_pass_through_transform(C), // Channel
make_pad_transform(H, pad_h, pad_h), // Height
make_pad_transform(W, pad_w, pad_w) // Width
),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{})
);
// Add padding to spatial dimensions
auto padded = transform_tensor_descriptor(
input_tensor,
make_tuple(
make_pass_through_transform(N), // Batch
make_pass_through_transform(C), // Channel
make_pad_transform(H, pad_h, pad_h), // Height
make_pad_transform(W, pad_w, pad_w) // Width
),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{})
);
For a complete convolution example, see :ref:`ck_tile_convolution_example`.

View File

@@ -260,7 +260,6 @@ Here's how CK Tile implements an optimized GEMM kernel:
index_t K)
{
// Define tile distribution encoding
// See :ref:`ck_tile_encoding_internals` and :ref:`ck_tile_tile_distribution`
using Encoding = tile_distribution_encoding<
sequence<>, // No replication
tuple<sequence<4, 2, 8, 4>, // M dimension hierarchy
@@ -274,7 +273,6 @@ Here's how CK Tile implements an optimized GEMM kernel:
constexpr auto tile_dist = make_static_tile_distribution(Encoding{});
// Create tensor views for global memory
// See :ref:`ck_tile_tensor_views` and :ref:`ck_tile_buffer_views`
auto a_global_view = make_naive_tensor_view<address_space_enum::global>(
a_global, make_tuple(M, K), make_tuple(K, 1));
auto b_global_view = make_naive_tensor_view<address_space_enum::global>(
@@ -287,7 +285,6 @@ Here's how CK Tile implements an optimized GEMM kernel:
const index_t block_n_id = blockIdx.x;
// Create tile windows for loading
// See :ref:`ck_tile_tile_window` for tile window details
auto a_window = make_tile_window(
a_global_view,
make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
@@ -301,7 +298,6 @@ Here's how CK Tile implements an optimized GEMM kernel:
tile_dist);
// Allocate LDS storage
// See :ref:`ck_tile_static_distributed_tensor` for distributed tensors
auto a_lds = make_static_distributed_tensor<ADataType,
decltype(tile_dist)>();
auto b_lds = make_static_distributed_tensor<BDataType,
@@ -310,7 +306,6 @@ Here's how CK Tile implements an optimized GEMM kernel:
// Initialize accumulator
auto c_reg = make_static_distributed_tensor<CDataType,
decltype(tile_dist)>();
// See :ref:`ck_tile_sweep_tile` for sweep operations
sweep_tile(c_reg, [](auto idx, auto& val) { val = 0; });
// Main GEMM loop with pipelining
@@ -324,7 +319,6 @@ Here's how CK Tile implements an optimized GEMM kernel:
// Pipeline loop
for(index_t k_tile = 0; k_tile < num_k_tiles - 1; ++k_tile) {
// Move windows for next iteration
// See :ref:`ck_tile_coordinate_movement` for window movement
a_window.move_slice_window(make_tuple(0, KPerBlock));
b_window.move_slice_window(make_tuple(0, KPerBlock));

View File

@@ -172,7 +172,6 @@ Example usage in CK Tile:
a_window.load(a_lds_tensor);
// Subsequent reads from LDS are conflict-free
// See :ref:`ck_tile_sweep_tile` for sweep operations
sweep_tile(a_lds_tensor, [](auto idx, auto& val) {
// Process data...
});

View File

@@ -276,7 +276,7 @@ The foundation of the exploration begins with raw memory access through :ref:`ck
With these foundational concepts established, the documentation delves into the :ref:`ck_tile_coordinate_systems` that powers tile distribution. This engine implements the mathematical framework that have been introduced, providing compile-time transformations between P-space, Y-space, X-space, and D-space. Understanding these transformations at a deep level enables developers to reason about performance implications and design custom distribution strategies for novel algorithms. The :ref:`ck_tile_transforms` and :ref:`ck_tile_adaptors` provide the building blocks for these transformations.
The high-level :ref:`ck_tile_distribution` APIs represent the culmination of these lower-level abstractions. These APIs provide an accessible interface for common patterns while exposing enough flexibility for advanced optimizations. Through concrete examples and detailed explanations, the documentation will demonstrate how to leverage these APIs to achieve near-optimal performance across a variety of computational patterns. The :ref:`ck_tile_window` abstraction provides the gateway for efficient data access.
The high-level :ref:`ck_tile_distribution` APIs represent the culmination of these lower-level abstractions. These APIs provide an accessible interface for common patterns while exposing enough flexibility for advanced optimizations. Through concrete examples and detailed explanations, the documentation will demonstrate how to leverage these APIs to achieve near-optimal performance across a variety of computational patterns. The :ref:`ck_tile_tile_window` abstraction provides the gateway for efficient data access.
The exploration of coordinate systems goes beyond the basic P, Y, X, D framework to encompass advanced topics such as multi-level tiling, replication strategies, and specialized coordinate systems for specific algorithm classes. The :ref:`ck_tile_encoding_internals` reveals the mathematical foundations, while :ref:`ck_tile_thread_mapping` shows how these abstractions map to hardware. This comprehensive treatment ensures that developers can handle not just common cases but also novel algorithms that require custom distribution strategies.

View File

@@ -5,7 +5,7 @@
.. _ck_tile_lds_index_swapping:
********************************
Load Datat Share Index Swapping
Load Data Share Index Swapping
********************************
Overview
@@ -70,9 +70,9 @@ The original K coordinate is split into K0 and K1, where K1 represents the threa
The XOR transformation updates the K0 coordinate using the formula:
.. code-block:: cpp
.. math::
K0' = K0 ^ (M % (KPerBlock / KPack * MLdsLayer))
K0' = K0^{(M \% (KPerBlock / KPack * MLdsLayer))}
This XOR operation redistributes accesses across memory banks by mixing bits from the M and K dimensions.
@@ -132,10 +132,10 @@ The transformed K0' is split into L and K0'' components, creating an intermediat
The unmerge operation:
.. code-block:: cpp
.. math::
L = K0' / (KPerBlock/KPack)
K0'' = K0' % (KPerBlock/KPack)
K0'' = K0' \% (KPerBlock/KPack)
When MLdsLayer == 1, this simplifies to L=0 and K0''=K0'.

View File

@@ -71,7 +71,6 @@ The LoadStoreTraits class analyzes distribution patterns at compile time:
static constexpr index_t scalars_per_access = scalar_per_vector;
// Space-filling curve for optimal traversal
// See :ref:`ck_tile_space_filling_curve` for details
using sfc_type = space_filling_curve<ndim_y>;
static constexpr sfc_type sfc_ys = make_space_filling_curve<Distribution>();
@@ -274,7 +273,7 @@ LoadStoreTraits optimizes for several performance metrics:
return Traits::num_access;
}
// Check coalescing efficiency (see :ref:`ck_tile_gpu_basics`)
// Check coalescing efficiency
static constexpr bool is_perfectly_coalesced()
{
// Perfect coalescing when adjacent threads access adjacent memory
@@ -316,7 +315,6 @@ Comparing Different Configurations
static_assert(OptimizedAnalyzer::bandwidth_utilization() == 50.0f); // 8*4/64
// Better bandwidth utilization leads to improved performance
// See :ref:`ck_tile_gemm_optimization` for real-world examples
Integration with Space-Filling Curves
-------------------------------------

View File

@@ -254,7 +254,6 @@ For :ref:`matrix multiplication <ck_tile_gemm_optimization>`, optimal access pat
// GEMM tile: 16x32 with vector-8 loads
// Column-major for coalesced access in GEMM
// See :ref:`ck_tile_gemm_optimization` for complete example
using GemmTileCurve = space_filling_curve<
2,
sequence<16, 32>, // Tile size
@@ -336,7 +335,7 @@ Optimizing for Hardware
.. code-block:: cpp
// Optimize for GPU memory coalescing (see :ref:`ck_tile_gpu_basics`)
// Optimize for GPU memory coalescing
template <typename DataType, index_t WarpSize = 32>
struct coalesced_access_pattern
{
@@ -411,7 +410,6 @@ LoadStoreTraits Integration
struct load_store_traits
{
// Create optimized space-filling curve
// See :ref:`ck_tile_tile_distribution` for Distribution details
using sfc_type = space_filling_curve<
Distribution::ndim_y,
typename Distribution::y_lengths,
@@ -461,7 +459,6 @@ Best Practices
.. code-block:: cpp
// Match vector size to cache line for optimal bandwidth
// See :ref:`ck_tile_lds_bank_conflicts` for cache optimization
constexpr index_t optimal_vector = min(
tensor_length_fast_dim,
cache_line_size / sizeof(DataType)

View File

@@ -17,9 +17,9 @@ Each thread in a workgroup owns a portion of the overall tensor data, stored in
This design enables three critical optimizations:
* It maximizes register utilization by keeping frequently accessed data in the fastest memory hierarchy.
* It eliminates redundant memory accesses since each thread maintains its own working set.
* It provides a clean abstraction for complex algorithms like matrix multiplication where each thread accumulates partial results that eventually combine into the final output.
* It maximizes register utilization by keeping frequently accessed data in the fastest memory hierarchy.
* It eliminates redundant memory accesses since each thread maintains its own working set.
* It provides a clean abstraction for complex algorithms like matrix multiplication where each thread accumulates partial results that eventually combine into the final output.
Thread-Local Storage Model
==========================
@@ -384,8 +384,7 @@ Static distributed tensors integrate seamlessly with other CK Tile components:
// Main GEMM loop
for(index_t k_tile = 0; k_tile < K; k_tile += kTileK) {
// Create tile windows for this iteration
// See :ref:`ck_tile_tile_window` for details
auto a_window = make_tile_window(
auto a_window = make_tile_window(
a_ptr, ALayout{M, K},
ATileDist{},
{blockIdx.y * kTileM, k_tile}
@@ -398,7 +397,6 @@ Static distributed tensors integrate seamlessly with other CK Tile components:
);
// Load tiles to distributed tensors
// See :ref:`ck_tile_load_store_traits` for optimized loading
auto a_tile = a_window.load();
auto b_tile = b_window.load();

View File

@@ -356,7 +356,6 @@ CK uses several techniques to optimize memory access:
float>>>;
// 2. Swizzling to avoid bank conflicts
// See :ref:`ck_tile_lds_index_swapping` and :ref:`ck_tile_swizzling_example`
template <index_t BankSize = 32>
__device__ index_t swizzle_offset(index_t tid, index_t offset)
{
@@ -434,7 +433,6 @@ The following example shows how thread mapping works in a CK kernel:
__shared__ ComputeType shared_sum[BlockSize];
// 5. Create tensor view and tile window
// See :ref:`ck_tile_tensor_views` and :ref:`ck_tile_tile_window`
auto x_view = make_naive_tensor_view<address_space_enum::global>(
x + bid * hidden_size,
make_tuple(hidden_size),

View File

@@ -1,4 +1,4 @@
.. _ck_tile_distribution:
.. _ck_tile_tile_distribution:
Tile Distribution - The Core API
================================

View File

@@ -283,7 +283,7 @@ Creating and Using TileWindow
using namespace ck_tile;
// Create a tensor view for input data (see :ref:`ck_tile_tensor_views`)
// Create a tensor view for input data
auto tensor_view = make_naive_tensor_view(
data_ptr,
make_tuple(256, 256), // Shape
@@ -314,7 +314,7 @@ Creating and Using TileWindow
distribution
);
// Load data into distributed tensor (see :ref:`ck_tile_static_distributed_tensor`)
// Load data into distributed tensor
auto distributed_data = make_static_distributed_tensor<float>(distribution);
window.load(distributed_data);
@@ -558,7 +558,6 @@ Complete Load-Compute-Store Pipeline
c_dist);
// Create distributed tensors for register storage
// See :ref:`ck_tile_static_distributed_tensor` for details
auto a_reg = make_static_distributed_tensor<AType>(a_dist);
auto b_reg = make_static_distributed_tensor<BType>(b_dist);
auto c_reg = make_static_distributed_tensor<CType>(c_dist);
@@ -620,6 +619,8 @@ Performance Characteristics
.. image:: diagrams/tile_window_5.svg
:alt: Diagram
:align: center
Best Practices
--------------

View File

@@ -302,7 +302,7 @@ EmbedTransform expands linear indices from the lower coordinate space into multi
using namespace ck_tile;
// Create embed transform for 2x3 tensor with strides [12, 1]
// This is commonly used in :ref:`descriptors <ck_tile_descriptors>`
// This is commonly used in descriptors
auto transform = make_embed_transform(make_tuple(2, 3), make_tuple(12, 1));
// Forward: Linear → 2D (Manual calculation)

View File

@@ -30,8 +30,6 @@ release = version_number
external_toc_path = "./sphinx/_toc.yml"
docs_core = ROCmDocs(left_nav_title)
docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
docs_core.enable_api_reference()
docs_core.setup()
external_projects_current_project = "composable_kernel"
@@ -50,4 +48,4 @@ for sphinx_var in ROCmDocs.SPHINX_VARS:
extensions += ['sphinxcontrib.bibtex']
bibtex_bibfiles = ['refs.bib']
cpp_id_attributes = ["__global__", "__device__", "__host__"]
cpp_id_attributes = ["__global__", "__device__", "__host__"]

File diff suppressed because it is too large Load Diff

View File

@@ -25,7 +25,7 @@ The Composable Kernel repository is located at `https://github.com/ROCm/composab
* :doc:`Composable Kernel structure <./conceptual/Composable-Kernel-structure>`
* :doc:`Composable Kernel mathematical basis <./conceptual/Composable-Kernel-math>`
* :doc:`CK Tile conceptual documentation <./conceptual/ck_tile/index>`
* :doc:`CK Tile conceptual documentation <./conceptual/ck_tile/CK-tile-index>`
.. grid-item-card:: Tutorials
@@ -37,9 +37,6 @@ The Composable Kernel repository is located at `https://github.com/ROCm/composab
* :doc:`Composable Kernel custom types <./reference/Composable_Kernel_custom_types>`
* :doc:`Composable Kernel vector utilities <./reference/Composable_Kernel_vector_utilities>`
* :ref:`wrapper`
* :doc:`Composable Kernel API reference <./doxygen/html/namespace_c_k>`
* :doc:`CK Tile API reference <./doxygen/html/namespaceck__tile>`
* :doc:`Composable Kernel complete API class list <./doxygen/html/annotated>`
* :doc:`Composable Kernel glossary <./reference/Composable-Kernel-Glossary>`
To contribute to the documentation refer to `Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_.

View File

@@ -4,7 +4,6 @@
***************************************************
Composable Kernel glossary
***************************************************
.. glossary::
@@ -14,7 +13,7 @@ Composable Kernel glossary
The arithmetic logic unit (ALU) is the GPU component responsible for arithmetic and logic operations.
compute unit
The compute unit (CU) is the parallel vector processor in an AMD GPU with multiple :term:`ALUs<arithmetic logic unit>`. Each compute unit will run all the :term:`wavefronts<wavefront>` in a :term:`work group>`. A compute unit is equivalent to NVIDIA's streaming multiprocessor.
The compute unit (CU) is the parallel vector processor in an AMD GPU with multiple :term:`ALUs<arithmetic logic unit>`. Each compute unit will run all the :term:`wavefronts<wavefront>` in a :term:`work group`. A compute unit is equivalent to NVIDIA's streaming multiprocessor.
matrix core
A matrix core is a specialized GPU unit that accelerate matrix operations for AI and deep learning tasks. A GPU contains multiple matrix cores.
@@ -32,7 +31,7 @@ Composable Kernel glossary
See :term:`scalar general purpose register`.
scalar general purpose register
A scalar general purpose register (SGPR) is a :term:`register` shared by all the :term:`work items<work item>` in a :term:`wave<wavefront>`. SGPRs are used for constants, addresses, and control flow common across the entire wave.
A scalar general purpose register (SGPR) is a :term:`register` shared by all the :term:`work-items<work-item>` in a :term:`wave<wavefront>`. SGPRs are used for constants, addresses, and control flow common across the entire wave.
LDS
See :term:`local data share`.
@@ -101,7 +100,7 @@ Composable Kernel glossary
A Composable Kernel pipeline schedules the sequence of operations for a :term:`kernel`, such as the data loading, computation, and storage phases. A pipeline consists of a :term:`problem` and a :term:`policy`.
tile partitioner
The tile partitioner defines the mapping between the :term:`problem` dimensions and GPU hierarchy. It specifies :term:`workgroup`-level :term:`tile` sizes and determines :term:`grid` dimensions by dividing the problem size by the tile sizes.
The tile partitioner defines the mapping between the :term:`problem` dimensions and GPU hierarchy. It specifies :term:`work group`-level :term:`tile` sizes and determines :term:`grid` dimensions by dividing the problem size by the tile sizes.
problem
The problem is the part of the :term:`pipeline` that defines input and output shapes, data types, and mathematical :term:`operations<operation>`.
@@ -186,10 +185,10 @@ Composable Kernel glossary
Viewport into a larger tensor that defines the current tile's position and boundaries for computation.
load tile
Load tile is an operation that transfers data from :term:`global memory` or the :term:`load data share` to :term:`vector general purpose registers<vector general purpose register>`.
Load tile is an operation that transfers data from :term:`global memory` or the :term:`local data share` to :term:`vector general purpose registers<vector general purpose register>`.
store tile
Store tile is an operation that transfers data from :term:`vector general purpose registers<vector general purpose register>` to :term:`global memory` or the :term:`load data share`.
Store tile is an operation that transfers data from :term:`vector general purpose registers<vector general purpose register>` to :term:`global memory` or the :term:`local data share`.
descriptor
Metadata structure that defines :term:`tile` properties, memory layouts, and coordinate transformations for Composable Kernel :term:`operations<operation>`.

View File

@@ -54,36 +54,3 @@ Advanced examples:
* `Image to column <https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_img2col.cpp>`_
* `Basic gemm <https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_basic_gemm.cpp>`_
* `Optimized gemm <https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp>`_
-------------------------------------
Layout
-------------------------------------
.. doxygenstruct:: Layout
-------------------------------------
Layout helpers
-------------------------------------
.. doxygenfile:: include/ck/wrapper/utils/layout_utils.hpp
-------------------------------------
Tensor
-------------------------------------
.. doxygenstruct:: Tensor
-------------------------------------
Tensor helpers
-------------------------------------
.. doxygenfile:: include/ck/wrapper/utils/tensor_utils.hpp
.. doxygenfile:: include/ck/wrapper/utils/tensor_partition.hpp
-------------------------------------
Operations
-------------------------------------
.. doxygenfile:: include/ck/wrapper/operations/copy.hpp
.. doxygenfile:: include/ck/wrapper/operations/gemm.hpp

View File

@@ -6,42 +6,36 @@ subtrees:
- caption: Install
entries:
- file: install/Composable-Kernel-prerequisites.rst
title: Composable Kernel prerequisites
title: Prerequisites
- file: install/Composable-Kernel-install.rst
title: Build and install Composable Kernel
- file: install/Composable-Kernel-Docker.rst
title: Composable Kernel Docker images
title: Docker images
- caption: Conceptual
entries:
- file: conceptual/Composable-Kernel-structure.rst
title: Composable Kernel structure
title: Structure
- file: conceptual/Composable-Kernel-math.rst
title: Composable Kernel mathematical basis
- file: conceptual/ck_tile/index.rst
title: Mathematical basis
- file: conceptual/ck_tile/CK-tile-index.rst
title: CK Tile conceptual documentation
- caption: Tutorial
entries:
- file: tutorial/Composable-Kernel-examples.rst
title: Composable Kernel examples
title: Examples
- caption: Reference
entries:
- file: reference/Composable_Kernel_supported_scalar_types.rst
title: Composable Kernel scalar types
title: Scalar types
- file: reference/Composable_Kernel_custom_types.rst
title: Composable Kernel custom types
title: Custom types
- file: reference/Composable_Kernel_vector_utilities.rst
title: Composable Kernel vector utilities
title: Vector utilities
- file: reference/Composable-Kernel-wrapper.rst
title: Composable Kernel wrapper
- file: doxygen/html/namespace_c_k.rst
title: CK API reference
- file: doxygen/html/namespaceck__tile.rst
title: CK Tile API reference
- file: doxygen/html/annotated.rst
title: Full API class list
title: Wrapper
- file: reference/Composable-Kernel-Glossary.rst
title: Glossary

View File

@@ -8,9 +8,9 @@ accessible-pygments==0.0.5
# via pydata-sphinx-theme
alabaster==1.0.0
# via sphinx
asttokens==3.0.0
asttokens==3.0.1
# via stack-data
attrs==25.3.0
attrs==25.4.0
# via
# jsonschema
# jupyter-cache
@@ -19,40 +19,30 @@ babel==2.17.0
# via
# pydata-sphinx-theme
# sphinx
beautifulsoup4==4.13.4
beautifulsoup4==4.14.3
# via pydata-sphinx-theme
breathe==4.36.0
# via rocm-docs-core
certifi==2025.1.31
certifi==2026.1.4
# via requests
cffi==1.17.1
cffi==2.0.0
# via
# cryptography
# pynacl
charset-normalizer==3.4.1
charset-normalizer==3.4.4
# via requests
click==8.1.8
click==8.3.1
# via
# click-log
# doxysphinx
# jupyter-cache
# sphinx-external-toc
click-log==0.4.0
# via doxysphinx
comm==0.2.2
comm==0.2.3
# via ipykernel
contourpy==1.3.2
# via matplotlib
cryptography==44.0.2
cryptography==46.0.3
# via pyjwt
cycler==0.12.1
# via matplotlib
debugpy==1.8.14
debugpy==1.8.19
# via ipykernel
decorator==5.2.1
# via ipython
deprecated==1.2.18
# via pygithub
docutils==0.21.2
# via
# myst-parser
@@ -60,35 +50,31 @@ docutils==0.21.2
# pydata-sphinx-theme
# sphinx
# sphinxcontrib-bibtex
doxysphinx==3.3.12
# via rocm-docs-core
exceptiongroup==1.2.2
exceptiongroup==1.3.1
# via ipython
executing==2.2.0
executing==2.2.1
# via stack-data
fastjsonschema==2.21.1
fastjsonschema==2.21.2
# via
# nbformat
# rocm-docs-core
fonttools==4.57.0
# via matplotlib
gitdb==4.0.12
# via gitpython
gitpython==3.1.44
gitpython==3.1.46
# via rocm-docs-core
greenlet==3.2.1
greenlet==3.3.0
# via sqlalchemy
idna==3.10
idna==3.11
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==8.6.1
importlib-metadata==8.7.1
# via
# jupyter-cache
# myst-nb
ipykernel==6.29.5
ipykernel==7.1.0
# via myst-nb
ipython==8.35.0
ipython==8.38.0
# via
# ipykernel
# myst-nb
@@ -98,53 +84,43 @@ jinja2==3.1.6
# via
# myst-parser
# sphinx
jsonschema==4.23.0
jsonschema==4.26.0
# via nbformat
jsonschema-specifications==2024.10.1
jsonschema-specifications==2025.9.1
# via jsonschema
jupyter-cache==1.0.1
# via myst-nb
jupyter-client==8.6.3
jupyter-client==8.8.0
# via
# ipykernel
# nbclient
jupyter-core==5.7.2
jupyter-core==5.9.1
# via
# ipykernel
# jupyter-client
# nbclient
# nbformat
kiwisolver==1.4.8
# via matplotlib
latexcodec==3.0.0
latexcodec==3.0.1
# via pybtex
libsass==0.22.0
# via doxysphinx
lxml==5.2.1
# via doxysphinx
markdown-it-py==3.0.0
# via
# mdit-py-plugins
# myst-parser
markupsafe==3.0.2
markupsafe==3.0.3
# via jinja2
matplotlib==3.10.1
# via doxysphinx
matplotlib-inline==0.1.7
matplotlib-inline==0.2.1
# via
# ipykernel
# ipython
mdit-py-plugins==0.4.2
mdit-py-plugins==0.5.0
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
mpire==2.10.2
# via doxysphinx
myst-nb==1.2.0
myst-nb==1.3.0
# via rocm-docs-core
myst-parser==4.0.1
# via myst-nb
nbclient==0.10.2
nbclient==0.10.4
# via
# jupyter-cache
# myst-nb
@@ -155,28 +131,20 @@ nbformat==5.10.4
# nbclient
nest-asyncio==1.6.0
# via ipykernel
numpy==1.26.4
# via
# contourpy
# doxysphinx
# matplotlib
packaging==25.0
# via
# ipykernel
# matplotlib
# pydata-sphinx-theme
# sphinx
parso==0.8.4
parso==0.8.5
# via jedi
pexpect==4.9.0
# via ipython
pillow==11.2.1
# via matplotlib
platformdirs==4.3.7
platformdirs==4.5.1
# via jupyter-core
prompt-toolkit==3.0.51
prompt-toolkit==3.0.52
# via ipython
psutil==7.0.0
psutil==7.2.1
# via ipykernel
ptyprocess==0.7.0
# via pexpect
@@ -188,36 +156,27 @@ pybtex==0.25.1
# sphinxcontrib-bibtex
pybtex-docutils==1.0.3
# via sphinxcontrib-bibtex
pycparser==2.22
pycparser==2.23
# via cffi
pydata-sphinx-theme==0.15.4
# via
# rocm-docs-core
# sphinx-book-theme
pygithub==2.6.1
pygithub==2.8.1
# via rocm-docs-core
pygments==2.19.1
pygments==2.19.2
# via
# accessible-pygments
# ipython
# mpire
# pydata-sphinx-theme
# sphinx
pyjson5==1.6.8
# via doxysphinx
pyjwt[crypto]==2.10.1
# via pygithub
pynacl==1.5.0
pynacl==1.6.2
# via pygithub
pyparsing==3.2.3
# via
# doxysphinx
# matplotlib
python-dateutil==2.9.0.post0
# via
# jupyter-client
# matplotlib
pyyaml==6.0.2
# via jupyter-client
pyyaml==6.0.3
# via
# jupyter-cache
# myst-nb
@@ -225,21 +184,21 @@ pyyaml==6.0.2
# pybtex
# rocm-docs-core
# sphinx-external-toc
pyzmq==26.4.0
pyzmq==27.1.0
# via
# ipykernel
# jupyter-client
referencing==0.36.2
referencing==0.37.0
# via
# jsonschema
# jsonschema-specifications
requests==2.32.3
requests==2.32.5
# via
# pygithub
# sphinx
rocm-docs-core[api-reference]==1.31.3
# via -r requirements.in
rpds-py==0.24.0
rpds-py==0.30.0
# via
# jsonschema
# referencing
@@ -247,9 +206,9 @@ six==1.17.0
# via python-dateutil
smmap==5.0.2
# via gitdb
snowballstemmer==2.2.0
snowballstemmer==3.0.1
# via sphinx
soupsieve==2.7
soupsieve==2.8.1
# via beautifulsoup4
sphinx==8.1.3
# via
@@ -288,23 +247,20 @@ sphinxcontrib-qthelp==2.0.0
# via sphinx
sphinxcontrib-serializinghtml==2.0.0
# via sphinx
sqlalchemy==2.0.40
sqlalchemy==2.0.45
# via jupyter-cache
stack-data==0.6.3
# via ipython
tabulate==0.9.0
# via jupyter-cache
tomli==2.2.1
tomli==2.4.0
# via sphinx
tornado==6.4.2
tornado==6.5.4
# via
# ipykernel
# jupyter-client
tqdm==4.67.1
# via mpire
traitlets==5.14.3
# via
# comm
# ipykernel
# ipython
# jupyter-client
@@ -312,22 +268,22 @@ traitlets==5.14.3
# matplotlib-inline
# nbclient
# nbformat
typing-extensions==4.13.2
typing-extensions==4.15.0
# via
# beautifulsoup4
# cryptography
# exceptiongroup
# ipython
# myst-nb
# pydata-sphinx-theme
# pygithub
# referencing
# sqlalchemy
urllib3==2.4.0
urllib3==2.6.3
# via
# pygithub
# requests
wcwidth==0.2.13
wcwidth==0.2.14
# via prompt-toolkit
wrapt==1.17.2
# via deprecated
zipp==3.21.0
zipp==3.23.0
# via importlib-metadata

View File

@@ -19,22 +19,22 @@ using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
// clang-format off
using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuffleV3<
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
PassThrough, PassThrough, PassThrough, GemmDefault,
PassThrough, PassThrough, PassThrough, GemmSpec,
256,
128, 256, 64,
8, 8,
16, 16,
2, 8,
S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 8, 8, 1,
S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 1, 8, 1,
S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 1, 8, 1,
1, 8, 8, 1,
1, 1,
S<1, 64, 1, 4>, 8,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1>;

View File

@@ -4,7 +4,7 @@
#include "run_gemm_quant_example.inc"
template <typename T>
using GemmConfig = GemmConfigQuantDecode<T>;
using GemmConfig = GemmConfigQuantDecodeInterwave<T>;
// GemmConfigQuantPrefill is also supported for aquant grouped quantization
// template <typename T>

View File

@@ -93,6 +93,27 @@ struct GemmConfigQuantDecode : public GemmConfigBase
static constexpr ck_tile::index_t N_Warp_Tile = 16;
static constexpr ck_tile::index_t K_Warp_Tile =
ck_tile::get_k_warp_tile<PrecType, M_Warp_Tile>();
// static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Interwave;
};
template <typename PrecType>
struct GemmConfigQuantDecodeInterwave : public GemmConfigBase
{
static constexpr ck_tile::index_t M_Tile = 16;
static constexpr ck_tile::index_t N_Tile = 64;
static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);
static constexpr ck_tile::index_t M_Warp = 1;
static constexpr ck_tile::index_t N_Warp = 4;
static constexpr ck_tile::index_t K_Warp = 1;
static constexpr ck_tile::index_t M_Warp_Tile = 16;
static constexpr ck_tile::index_t N_Warp_Tile = 16;
static constexpr ck_tile::index_t K_Warp_Tile =
ck_tile::get_k_warp_tile<PrecType, M_Warp_Tile>();
static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Interwave;
};
template <typename PrecType>
@@ -229,6 +250,8 @@ struct GemmConfigQuantPrefill : public GemmConfigBase
static constexpr ck_tile::index_t N_Warp_Tile = 16;
static constexpr ck_tile::index_t K_Warp_Tile =
ck_tile::get_k_warp_tile<PrecType, M_Warp_Tile>();
// static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Interwave;
};
template <typename PrecType>

View File

@@ -660,7 +660,7 @@ int run_gemm_example_with_layouts(const ck_tile::ArgParser& arg_parser,
else
{
ck_tile::FillConstant<ADataType>{static_cast<ADataType>(0x22)}(a_m_k);
ck_tile::FillConstant<AQDataType>{static_cast<AQDataType>(0.5f)}(*aq_tensor_ptr);
ck_tile::FillConstant<AQDataType>{static_cast<AQDataType>(1.0f)}(*aq_tensor_ptr);
ck_tile::FillConstant<BDataType>{static_cast<BDataType>(0x38)}(b_k_n);
if constexpr(QuantMode == ck_tile::QuantType::RowColQuant)
@@ -669,6 +669,184 @@ int run_gemm_example_with_layouts(const ck_tile::ArgParser& arg_parser,
}
}
}
else if(init_method == 3)
{
if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
{
ck_tile::FillConstant<ADataType>{static_cast<ADataType>(0x38)}(a_m_k);
ck_tile::FillConstant<BDataType>{static_cast<BDataType>(0x22)}(b_k_n);
ck_tile::FillConstant<BQDataType>{static_cast<BQDataType>(0.5f)}(*bq_tensor_ptr);
}
else if constexpr(QuantMode == ck_tile::QuantType::ABQuantGrouped)
{
ck_tile::FillConstant<ADataType>{static_cast<ADataType>(0x38)}(a_m_k);
ck_tile::FillConstant<BDataType>{static_cast<BDataType>(0x22)}(b_k_n);
ck_tile::FillConstant<AQDataType>{static_cast<AQDataType>(0.5f)}(*aq_tensor_ptr);
ck_tile::FillConstant<BQDataType>{static_cast<BQDataType>(0.5f)}(*bq_tensor_ptr);
}
else
{
ck_tile::FillConstant<ADataType>{static_cast<ADataType>(0x22)}(a_m_k);
ck_tile::FillConstant<AQDataType>{static_cast<AQDataType>(2.0f)}(*aq_tensor_ptr);
ck_tile::FillConstant<BDataType>{static_cast<BDataType>(0x38)}(b_k_n);
if constexpr(QuantMode == ck_tile::QuantType::RowColQuant)
{
ck_tile::FillConstant<BQDataType>{static_cast<BQDataType>(0.5f)}(*bq_tensor_ptr);
}
}
}
else if(init_method == 4)
{
if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
{
if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
{
ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
b_k_n);
ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
*bq_tensor_ptr);
}
else if constexpr(std::is_same_v<BDataType, ck_tile::pk_fp4_raw_t>)
{
ck_tile::FillUniformDistribution<BDataType>{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
ck_tile::FillUniformDistribution<BQDataType>{125.f, 130.f, fill_seed(gen)}(
*bq_tensor_ptr);
}
else
{
ck_tile::FillUniformDistribution<BDataType>{-2.0f, 3.0f, fill_seed(gen)}(b_k_n);
ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
*bq_tensor_ptr);
}
ck_tile::FillUniformDistribution<ADataType>{-5.0f, 5.0f, fill_seed(gen)}(a_m_k);
}
else if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped)
{
if constexpr(std::is_same_v<ADataType, ck_tile::pk_int4_t>)
{
ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
a_m_k);
}
else
{
ck_tile::FillUniformDistribution<ADataType>{-2.0f, 3.0f, fill_seed(gen)}(a_m_k);
}
ck_tile::FillUniformDistribution<AQDataType>{2.0f, 2.0f, fill_seed(gen)}(
*aq_tensor_ptr);
ck_tile::FillUniformDistribution<BDataType>{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
}
else if constexpr(QuantMode == ck_tile::QuantType::ABQuantGrouped)
{
if constexpr(std::is_same_v<ADataType, ck_tile::pk_int4_t>)
{
ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
a_m_k);
ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
b_k_n);
}
else
{
ck_tile::FillUniformDistribution<ADataType>{-2.0f, 3.0f, fill_seed(gen)}(a_m_k);
ck_tile::FillUniformDistribution<BDataType>{-2.0f, 3.0f, fill_seed(gen)}(b_k_n);
}
ck_tile::FillUniformDistribution<AQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
*aq_tensor_ptr);
ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
*bq_tensor_ptr);
}
else
{
ck_tile::FillUniformDistribution<ADataType>{-2.0f, 2.0f, fill_seed(gen)}(a_m_k);
ck_tile::FillUniformDistribution<BDataType>{-2.0f, 2.0f, fill_seed(gen)}(b_k_n);
ck_tile::FillUniformDistribution<AQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
*aq_tensor_ptr);
ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
*bq_tensor_ptr);
}
}
else if(init_method == 5)
{
if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
{
if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
{
ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
b_k_n);
ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
*bq_tensor_ptr);
}
else if constexpr(std::is_same_v<BDataType, ck_tile::pk_fp4_raw_t>)
{
ck_tile::FillUniformDistribution<BDataType>{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
ck_tile::FillUniformDistribution<BQDataType>{125.f, 130.f, fill_seed(gen)}(
*bq_tensor_ptr);
}
else
{
ck_tile::FillUniformDistribution<BDataType>{-2.0f, 3.0f, fill_seed(gen)}(b_k_n);
ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
*bq_tensor_ptr);
}
ck_tile::FillUniformDistribution<ADataType>{-5.0f, 5.0f, fill_seed(gen)}(a_m_k);
}
else if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped)
{
if constexpr(std::is_same_v<ADataType, ck_tile::pk_int4_t>)
{
ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
a_m_k);
}
else
{
ck_tile::FillUniformDistribution<ADataType>{1.0f, 1.0f, fill_seed(gen)}(a_m_k);
}
// Fill aquant such that column j has value 2^j (1, 2, 4, 8, ...)
for(ck_tile::index_t row = 0;
row < static_cast<ck_tile::index_t>(aq_tensor_ptr->get_length(0));
++row)
{
for(ck_tile::index_t col = 0;
col < static_cast<ck_tile::index_t>(aq_tensor_ptr->get_length(1));
++col)
{
(*aq_tensor_ptr)(row, col) = static_cast<AQDataType>(col + 1);
}
}
// std::cout << "aq_tensor_ptr: " << *aq_tensor_ptr << std::endl;
ck_tile::FillUniformDistribution<BDataType>{1.0f, 1.0f, fill_seed(gen)}(b_k_n);
}
else if constexpr(QuantMode == ck_tile::QuantType::ABQuantGrouped)
{
if constexpr(std::is_same_v<ADataType, ck_tile::pk_int4_t>)
{
ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
a_m_k);
ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
b_k_n);
}
else
{
ck_tile::FillUniformDistribution<ADataType>{-2.0f, 3.0f, fill_seed(gen)}(a_m_k);
ck_tile::FillUniformDistribution<BDataType>{-2.0f, 3.0f, fill_seed(gen)}(b_k_n);
}
ck_tile::FillUniformDistribution<AQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
*aq_tensor_ptr);
ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
*bq_tensor_ptr);
}
else
{
ck_tile::FillUniformDistribution<ADataType>{-2.0f, 2.0f, fill_seed(gen)}(a_m_k);
ck_tile::FillUniformDistribution<BDataType>{-2.0f, 2.0f, fill_seed(gen)}(b_k_n);
ck_tile::FillUniformDistribution<AQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
*aq_tensor_ptr);
ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
*bq_tensor_ptr);
}
}
else
{
a_m_k.SetZero();

View File

@@ -58,6 +58,7 @@ consteval BlockGemmSpec SetBlockGemm()
case PipelineVersion::V3: version = ck::BlockGemmPipelineVersion::v3; break;
case PipelineVersion::V4: version = ck::BlockGemmPipelineVersion::v4; break;
case PipelineVersion::V5: version = ck::BlockGemmPipelineVersion::v5; break;
case PipelineVersion::V6: throw "PipelineVersion::V6 is supported only for CK Tile.";
case PipelineVersion::WEIGHT_ONLY:
throw "PipelineVersion::WEIGHT_ONLY is not supported for block GEMM.";
default: throw "Unknown PipelineVersion";
@@ -92,6 +93,7 @@ consteval ck::PipelineVersion SetGridwiseGemmPipelineVersion()
case PipelineVersion::V3: throw "PipelineVersion::V3 is used only for stream-K.";
case PipelineVersion::V4: return ck_pipeline::v4;
case PipelineVersion::V5: throw "PipelineVersion::V5 cannot be used for gridwise GEMM.";
case PipelineVersion::V6: throw "PipelineVersion::V6 can be used only for CK TILE.";
case PipelineVersion::WEIGHT_ONLY: return ck_pipeline::weight_only;
default: throw "Unknown GridwiseGemmPipelineVersion";
}
@@ -137,6 +139,7 @@ consteval ck::BlockGemmPipelineVersion SetBlockGemmPipelineVersion()
case PipelineVersion::V3: return ck_pipeline::v3;
case PipelineVersion::V4: return ck_pipeline::v4;
case PipelineVersion::V5: return ck_pipeline::v5;
case PipelineVersion::V6: throw "PipelineVersion::V6 is supported only for CK Tile.";
case PipelineVersion::WEIGHT_ONLY:
throw "PipelineVersion::WEIGHT_ONLY is not supported for block GEMM pipeline version.";
default: throw "Unknown block GEMM PipelineVersion";

View File

@@ -91,6 +91,13 @@ struct TilePipelineType<ck_tile::GemmPipeline::COMPUTE_V5>
using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV5<PipelineProblem>;
};
template <>
struct TilePipelineType<ck_tile::GemmPipeline::COMPUTE_V6>
{
template <typename PipelineProblem>
using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV6<PipelineProblem>;
};
template <ConvAlgorithmDescriptor auto ALGORITHM>
consteval ck_tile::GemmPipeline SetTileBlockGemmPipelineVersion()
{
@@ -103,6 +110,7 @@ consteval ck_tile::GemmPipeline SetTileBlockGemmPipelineVersion()
case PipelineVersion::V3: return ck_tile_pipeline::COMPUTE_V3;
case PipelineVersion::V4: return ck_tile_pipeline::COMPUTE_V4;
case PipelineVersion::V5: return ck_tile_pipeline::COMPUTE_V5;
case PipelineVersion::V6: return ck_tile_pipeline::COMPUTE_V6;
case PipelineVersion::WEIGHT_ONLY:
throw "PipelineVersion::WEIGHT_ONLY is not supported for block GEMM pipeline version.";
default: throw "Unknown block GEMM PipelineVersion";

View File

@@ -7,26 +7,25 @@
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
#include "ck_tile/builder/testing/testing.hpp"
#include "ck_tile/builder/testing/testing_reflect.hpp"
#include "ck_tile/builder/testing/filter_extent.hpp"
#include "ck_tile/builder/testing/tensor_buffer.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck_tile/builder/testing/tensor_initialization.hpp"
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
#include "ck_tile/builder/testing/validation.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
/// This file implements common functionality for invoking/testing grouped
/// forward convolutions created through the CK Builder API. The main item
/// of it is the ConvArgs structure - which contains a complete description
/// of it is the Args structure - which contains a complete description
/// of a convolution operation.
///
/// It is not intended that this file contains implementation details for
/// actually launching a convolution operation. As this can be done
/// through different APIs depending on the kernel (CK, CK Tile, or a
/// reference implementation), the code dealing with that is split out
/// into a separate header for each implementation.
/// into a separate header for each implementation. Nor does this file
/// deal with details for defining the data types (`Inputs` and `Outputs`)
/// for different conv directions, that is also split out into separate
/// headers to keep this one small.
namespace ck_tile::builder::test {
@@ -56,7 +55,7 @@ struct ConvTensorLengths
///
/// @see Args
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
requires ValidConvSignature<SIGNATURE>
struct Args<SIGNATURE>
{
constexpr static auto SPATIAL_DIM = SIGNATURE.spatial_dim;
@@ -204,53 +203,4 @@ struct Args<SIGNATURE>
}
};
/// @brief `Inputs` specialization for forward convolution.
///
/// @tparam SIGNATURE Forward convolution signature.
///
/// @see Inputs
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
struct Inputs<SIGNATURE>
{
void* input;
void* weight;
static void reflect(const Args<SIGNATURE>& args, const auto& inspect)
{
inspect("input", args.make_input_descriptor(), &Inputs<SIGNATURE>::input);
inspect("weight", args.make_weight_descriptor(), &Inputs<SIGNATURE>::weight);
}
};
/// @brief `Outputs` specialization for forward convolution.
///
/// @tparam SIGNATURE Forward convolution signature.
///
/// @see Outputs
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
struct Outputs<SIGNATURE>
{
void* output;
static void reflect(const Args<SIGNATURE>& args, const auto& inspect)
{
inspect("output", args.make_output_descriptor(), &Outputs<SIGNATURE>::output);
}
};
/// @brief `init_inputs()` specialization for forward convolution.
///
/// @tparam SIGNATURE Forward convolution signature.
///
/// @see alloc_inputs()
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
void init_inputs(const Args<SIGNATURE>& args, Inputs<SIGNATURE> inputs)
{
init_tensor_buffer_uniform_fp(inputs.input, args.make_input_descriptor(), -2.0f, 2.0f);
init_tensor_buffer_uniform_fp(inputs.weight, args.make_weight_descriptor(), -2.0f, 2.0f);
}
} // namespace ck_tile::builder::test

View File

@@ -0,0 +1,71 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/builder/testing/tensor_initialization.hpp"
#include "ck_tile/builder/testing/testing_reflect.hpp"
#include "ck_tile/builder/testing/conv/args.hpp"
#include "ck_tile/builder/testing/conv/fwd.hpp"
#include "ck_tile/builder/testing/error.hpp"
/// This file deals with the backward weight-specific details of running grouped
/// convolution backwards weight operations. It mainly defines the data
/// structures (`Input` and `Output`), initialization, and validation. Note
/// that for this operation specifically, many of the operations are
/// implemented automatically via testing_reflect.hpp.
namespace ck_tile::builder::test {
/// @brief `Inputs` specialization for backwards weight convolution.
///
/// @tparam SIGNATURE Backwards weight convolution signature.
///
/// @see Inputs
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsBackwardWeight<SIGNATURE>
struct Inputs<SIGNATURE>
{
void* input;
void* output;
// See testing_reflect.hpp
static void reflect(const Args<SIGNATURE>& args, const auto& inspect)
{
inspect("input", args.make_input_descriptor(), &Inputs<SIGNATURE>::input);
inspect("output", args.make_output_descriptor(), &Inputs<SIGNATURE>::output);
}
};
/// @brief `Outputs` specialization for backwards weight convolution.
///
/// @tparam SIGNATURE Backwards weight convolution signature.
///
/// @see Outputs
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsBackwardWeight<SIGNATURE>
struct Outputs<SIGNATURE>
{
void* weight;
// See testing_reflect.hpp
static void reflect(const Args<SIGNATURE>& args, const auto& inspect)
{
inspect("weight", args.make_weight_descriptor(), &Outputs<SIGNATURE>::weight);
}
};
/// @brief `init_inputs()` specialization for backwards convolution.
///
/// @tparam SIGNATURE Backwards weight convolution signature.
///
/// @see init_inputs()
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsBackwardWeight<SIGNATURE>
void init_inputs(const Args<SIGNATURE>& args, Inputs<SIGNATURE> inputs)
{
init_tensor_buffer_uniform_fp(inputs.input, args.make_input_descriptor(), -2.0f, 2.0f);
init_tensor_buffer_uniform_fp(inputs.output, args.make_output_descriptor(), -2.0f, 2.0f);
}
} // namespace ck_tile::builder::test

View File

@@ -0,0 +1,276 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/builder/testing/testing.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
#include <type_traits>
#include <array>
/// This file contains the implementation details for invoking/testing
/// bwd grouped convolution operations in old CK. The main item is the
/// `run()` function, which is the main implementation used to invoke
/// CK grouped forward convolution kernels.
namespace ck_tile::builder::test {
namespace detail {
/// @brief Concept for checking whether a bwd weight convolution is invoked like old CK.
///
/// This is the same as `::ck_tile::builder::test::CkConvBwdWeightInstance`, except
/// with some utility aliases. For that reason, its moved to this detail
/// namespace.
template <typename Conv,
auto SIGNATURE,
size_t SPATIAL_DIM = SIGNATURE.spatial_dim,
// TODO: We shouldn't need to call into an internal namespace here.
typename Types = factory::internal::ConvTensorDataTypes<SIGNATURE>,
typename Ops = factory::internal::ConvElementwiseOps<SIGNATURE>>
concept CkConvBwdWeightInstance = requires(Conv& conv,
const Types::InDataType* p_a,
Types::WeiDataType* p_b,
const Types::OutDataType* p_e,
std::array<index_t, SPATIAL_DIM + 3> lengths,
std::array<index_t, SPATIAL_DIM + 3> strides,
std::array<index_t, SPATIAL_DIM> filter,
Ops::InElementwiseOp elementwise_a,
Ops::WeiElementwiseOp elementwise_b,
Ops::OutElementwiseOp elementwise_cde,
ck::index_t split_k) {
requires ValidConvSignature<SIGNATURE>;
requires ConvDirectionIsBackwardWeight<SIGNATURE>;
{
conv.MakeArgument(p_a,
p_b,
p_e,
// A lengths/strides
lengths,
strides,
// B lengths/strides
lengths,
strides,
// E lengths/strides
lengths,
strides,
// strides/dilations/pads
filter,
filter,
filter,
filter,
// element-wise operations.
elementwise_a,
elementwise_b,
elementwise_cde,
split_k)
};
};
/// @brief Concept for checking whether a bwd weight convolution is multiple-d and
/// invoked like old CK.
///
/// This is the same as `::ck_tile::builder::test::CkConvBwdWeightMultipleDInstance`, except
/// with some utility aliases. For that reason, its moved to this detail
/// namespace.
template <typename Conv,
auto SIGNATURE,
size_t SPATIAL_DIM = SIGNATURE.spatial_dim,
// TODO: We shouldn't need to call into an internal namespace here.
typename Types = factory::internal::ConvTensorDataTypes<SIGNATURE>,
typename Ops = factory::internal::ConvElementwiseOps<SIGNATURE>>
concept CkConvBwdWeightMultipleDInstance = requires(Conv& conv,
const Types::InDataType* p_a,
Types::WeiDataType* p_b,
const Types::OutDataType* p_e,
std::array<index_t, SPATIAL_DIM + 3> lengths,
std::array<index_t, SPATIAL_DIM + 3> strides,
std::array<index_t, SPATIAL_DIM> filter,
Ops::InElementwiseOp elementwise_a,
Ops::WeiElementwiseOp elementwise_b,
Ops::OutElementwiseOp elementwise_cde,
ck::index_t split_k) {
requires ValidConvSignature<SIGNATURE>;
requires ConvDirectionIsBackwardWeight<SIGNATURE>;
{
conv.MakeArgument(p_a,
p_b,
p_e,
// TODO: Actually support multiple d
{},
// A lengths/strides
lengths,
strides,
// B lengths/strides
lengths,
strides,
// E lengths/strides
lengths,
strides,
// TODO: Multiple D lengths/strides
{},
{},
// strides/dilations/pads
filter,
filter,
filter,
filter,
// element-wise operations.
elementwise_a,
elementwise_b,
elementwise_cde,
split_k)
};
};
} // namespace detail
/// @brief Concept for checking whether a bwd weight convolution is invoked like old CK.
///
/// - SIGNATURE is the operation signature.
/// - Conv is a convolution instance created by the CK Builder API.
template <typename Conv, auto SIGNATURE>
concept CkConvBwdWeightInstance = detail::CkConvBwdWeightInstance<Conv, SIGNATURE>;
/// @brief Concept for checking whether a bwd weight convolution is multiple-d and
/// invoked like old CK.
///
/// - SIGNATURE is the operation signature.
/// - Conv is a convolution instance created by the CK Builder API.
template <typename Conv, auto SIGNATURE>
concept CkConvBwdWeightMultipleDInstance =
detail::CkConvBwdWeightMultipleDInstance<Conv, SIGNATURE>;
/// @brief `run()` specialization for backward weight convolution and old CK.
///
/// @tparam SIGNATURE Forward convolution signature.
/// @returns RunResult about how the operation completed (or not).
///
/// @see run()
template <auto SIGNATURE>
[[nodiscard]] RunResult run(CkConvBwdWeightInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs)
{
using Types = factory::internal::ConvTensorDataTypes<SIGNATURE>;
constexpr auto spatial_dim = SIGNATURE.spatial_dim;
const auto copy = [](const auto& src, auto& dst) {
std::copy(src.begin(), src.end(), dst.begin());
};
const auto to_ck_lengths = [&](const auto& src) {
std::array<ck::index_t, spatial_dim + 3> result;
copy(src, result);
return result;
};
const auto to_ck_extent = [&](const auto& extent) {
std::array<ck::index_t, spatial_dim> result;
copy(extent, result);
return result;
};
const auto param = args.to_ck_conv_param();
const auto input_desc = args.make_input_descriptor();
const auto weight_desc = args.make_weight_descriptor();
const auto output_desc = args.make_output_descriptor();
auto ck_args = conv.MakeArgument(static_cast<const Types::InDataType*>(inputs.input),
static_cast<Types::WeiDataType*>(outputs.weight),
static_cast<const Types::OutDataType*>(inputs.output),
to_ck_lengths(input_desc.get_lengths()),
to_ck_lengths(input_desc.get_strides()),
to_ck_lengths(weight_desc.get_lengths()),
to_ck_lengths(weight_desc.get_strides()),
to_ck_lengths(output_desc.get_lengths()),
to_ck_lengths(output_desc.get_strides()),
to_ck_extent(param.conv_filter_strides_),
to_ck_extent(param.conv_filter_dilations_),
to_ck_extent(param.input_left_pads_),
to_ck_extent(param.input_right_pads_),
args.a_elementwise_op,
args.b_elementwise_op,
args.cde_elementwise_op,
args.k_batch);
if(!conv.IsSupportedArgument(ck_args))
return RunResult::not_supported("invalid ck arguments");
return RunResult::from_runtime(conv.MakeInvoker().Run(ck_args, {}));
}
/// @brief `run()` specialization for backward weight convolution and old CK.
///
/// This overload is specialized for Multiple-D.
///
/// @tparam SIGNATURE Forward convolution signature.
/// @returns RunResult about how the operation completed (or not).
///
/// @see run()
template <auto SIGNATURE>
[[nodiscard]] RunResult run(CkConvBwdWeightMultipleDInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs)
{
using Types = factory::internal::ConvTensorDataTypes<SIGNATURE>;
constexpr auto spatial_dim = SIGNATURE.spatial_dim;
const auto copy = [](const auto& src, auto& dst) {
std::copy(src.begin(), src.end(), dst.begin());
};
const auto to_ck_lengths = [&](const auto& src) {
std::array<ck::index_t, spatial_dim + 3> result;
copy(src, result);
return result;
};
const auto to_ck_extent = [&](const auto& extent) {
std::array<ck::index_t, spatial_dim> result;
copy(extent, result);
return result;
};
const auto param = args.to_ck_conv_param();
const auto input_desc = args.make_input_descriptor();
const auto weight_desc = args.make_weight_descriptor();
const auto output_desc = args.make_output_descriptor();
auto ck_args = conv.MakeArgument(static_cast<const Types::InDataType*>(inputs.input),
static_cast<Types::WeiDataType*>(outputs.weight),
static_cast<const Types::OutDataType*>(inputs.output),
{}, // TODO
to_ck_lengths(input_desc.get_lengths()),
to_ck_lengths(input_desc.get_strides()),
to_ck_lengths(weight_desc.get_lengths()),
to_ck_lengths(weight_desc.get_strides()),
to_ck_lengths(output_desc.get_lengths()),
to_ck_lengths(output_desc.get_strides()),
{}, // TODO
{}, // TODO
to_ck_extent(param.conv_filter_strides_),
to_ck_extent(param.conv_filter_dilations_),
to_ck_extent(param.input_left_pads_),
to_ck_extent(param.input_right_pads_),
args.a_elementwise_op,
args.b_elementwise_op,
args.cde_elementwise_op,
args.k_batch);
if(!conv.IsSupportedArgument(ck_args))
return RunResult::not_supported("invalid ck arguments");
return RunResult::from_runtime(conv.MakeInvoker().Run(ck_args, {}));
}
} // namespace ck_tile::builder::test

View File

@@ -3,9 +3,8 @@
#pragma once
#include "ck_tile/builder/testing/conv_fwd.hpp"
#include "ck_tile/builder/testing/testing.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
#include "ck_tile/ops/gemm.hpp"
#include "ck_tile/ops/grouped_convolution.hpp"
#include <type_traits>
@@ -28,9 +27,39 @@ namespace detail {
/// namespace.
template <typename Conv, auto SIGNATURE>
concept CkTileConvInstance = requires(Conv&) {
requires ValidConvSignature<SIGNATURE>;
{ Conv::BlockSize() };
};
template <auto SIGNATURE, typename InDataType, typename WeiDataType, typename OutDataType>
[[nodiscard]] RunResult run(CkTileConvInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
InDataType* input,
WeiDataType* weight,
OutDataType* output,
const ck_tile::stream_config s_conf)
{
using Conv = std::remove_reference_t<decltype(conv)>;
const auto param = args.to_ck_tile_conv_param();
ck_tile::GroupedConvHostArgs<InDataType*, WeiDataType*, OutDataType*, ck_tile::PassThrough>
host_args(param, input, weight, {}, output, args.k_batch);
auto kargs = Conv::MakeKernelArgs(host_args);
const dim3 grids = Conv::GridSize(kargs);
const dim3 blocks = Conv::BlockSize();
if(!Conv::IsSupportedArgument(kargs))
return RunResult::not_supported("unsupported ck_tile arguments");
constexpr index_t minimum_occupancy =
Conv::GemmPipeline::Scheduler == ck_tile::GemmPipelineScheduler::Intrawave ? 1 : 2;
return RunResult::from_runtime(ck_tile::launch_kernel(
s_conf, ck_tile::make_kernel<minimum_occupancy>(conv, grids, blocks, 0, kargs)));
}
} // namespace detail
/// @brief Concept for checking whether a convolution is invoked like CK Tile.
@@ -48,44 +77,45 @@ concept CkTileConvInstance = detail::CkTileConvInstance<Conv, SIGNATURE>;
/// @brief `run()` specialization for forward convolution and CK Tile.
///
/// @tparam SIGNATURE Forward convolution signature.
/// @throws std::runtime_error if the arguments weren't actually valid for the
/// operation. This should be caught and reported by the testing framework.
/// @return std::tuple<bool, float> - whether the problem is supported and
/// kernel execution time (0.0f if s_conf time_kernel is false).
/// @returns RunResult about how the operation completed (or not).
///
/// @see run()
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
std::tuple<bool, float> run(CkTileConvInstance<SIGNATURE> auto& conv,
requires ConvDirectionIsForward<SIGNATURE>
[[nodiscard]] RunResult run(CkTileConvInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs,
const ck_tile::stream_config s_conf = {})
{
using Conv = std::remove_reference_t<decltype(conv)>;
const auto param = args.to_ck_tile_conv_param();
return detail::run(conv,
args,
static_cast<const void*>(inputs.input),
static_cast<const void*>(inputs.weight),
static_cast<void*>(outputs.output),
s_conf);
}
ck_tile::GroupedConvFwdHostArgs<> host_args(
param, inputs.input, inputs.weight, {}, outputs.output, args.k_batch);
auto kargs = Conv::MakeKernelArgs(host_args);
const dim3 grids = Conv::GridSize(kargs);
const dim3 blocks = Conv::BlockSize();
if(!Conv::IsSupportedArgument(kargs))
{
std::cout << "Not supported!";
return std::make_tuple(false, 0.f);
}
constexpr index_t minimum_occupancy =
Conv::GemmPipeline::Scheduler == ck_tile::GemmPipelineScheduler::Intrawave ? 1 : 2;
return std::make_tuple(
true,
ck_tile::launch_kernel(
s_conf, ck_tile::make_kernel<minimum_occupancy>(conv, grids, blocks, 0, kargs)));
/// @brief `run()` specialization for backwards weight convolution and CK Tile.
///
/// @tparam SIGNATURE Backwards weight convolution signature.
/// @returns RunResult about how the operation completed (or not).
///
/// @see run()
template <auto SIGNATURE>
requires ConvDirectionIsBackwardWeight<SIGNATURE>
[[nodiscard]] RunResult run(CkTileConvInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs,
const ck_tile::stream_config s_conf = {})
{
return detail::run(conv,
args,
static_cast<const void*>(inputs.input),
static_cast<void*>(outputs.weight),
static_cast<const void*>(inputs.output),
s_conf);
}
} // namespace ck_tile::builder::test

View File

@@ -0,0 +1,69 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/builder/testing/tensor_initialization.hpp"
#include "ck_tile/builder/testing/testing_reflect.hpp"
#include "ck_tile/builder/testing/conv/args.hpp"
/// This file deals with the forward-specific details of running grouped
/// convolution forward operations. It mainly defines the data structures
/// (`Input` and `Output`), initialization, and validation. Note that
/// for this operation specifically, many of the operations are implemented
/// automatically via testing_reflect.hpp.
namespace ck_tile::builder::test {
/// @brief `Inputs` specialization for forward convolution.
///
/// @tparam SIGNATURE Forward convolution signature.
///
/// @see Inputs
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
struct Inputs<SIGNATURE>
{
void* input;
void* weight;
// See testing_reflect.hpp
static void reflect(const Args<SIGNATURE>& args, const auto& inspect)
{
inspect("input", args.make_input_descriptor(), &Inputs<SIGNATURE>::input);
inspect("weight", args.make_weight_descriptor(), &Inputs<SIGNATURE>::weight);
}
};
/// @brief `Outputs` specialization for forward convolution.
///
/// @tparam SIGNATURE Forward convolution signature.
///
/// @see Outputs
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
struct Outputs<SIGNATURE>
{
void* output;
// See testing_reflect.hpp
static void reflect(const Args<SIGNATURE>& args, const auto& inspect)
{
inspect("output", args.make_output_descriptor(), &Outputs<SIGNATURE>::output);
}
};
/// @brief `init_inputs()` specialization for forward convolution.
///
/// @tparam SIGNATURE Forward convolution signature.
///
/// @see init_inputs()
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
void init_inputs(const Args<SIGNATURE>& args, Inputs<SIGNATURE> inputs)
{
init_tensor_buffer_uniform_fp(inputs.input, args.make_input_descriptor(), -2.0f, 2.0f);
init_tensor_buffer_uniform_fp(inputs.weight, args.make_weight_descriptor(), -2.0f, 2.0f);
}
} // namespace ck_tile::builder::test

View File

@@ -3,14 +3,14 @@
#pragma once
#include "ck_tile/builder/testing/conv_fwd.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/builder/testing/testing.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include <type_traits>
#include <array>
/// This file contains the implementation details for invoking/testing
/// grouped convolution operations in old CK. The main item is the
/// fwd grouped convolution operations in old CK. The main item is the
/// `run()` function, which is the main implementation used to invoke
/// CK grouped forward convolution kernels.
@@ -18,10 +18,9 @@ namespace ck_tile::builder::test {
namespace detail {
/// @brief Concept for checking whether this is the reference convolution
/// implementation.
/// @brief Concept for checking whether a fwd convolution is invoked like old CK.
///
/// This is the same as `::ck_tile::builder::test::CkConvInstance`, except
/// This is the same as `::ck_tile::builder::test::CkConvFwdInstance`, except
/// with some utility aliases. For that reason, its moved to this detail
/// namespace.
template <typename Conv,
@@ -29,18 +28,21 @@ template <typename Conv,
size_t SPATIAL_DIM = SIGNATURE.spatial_dim,
// TODO: We shouldn't need to call into an internal namespace here.
typename Ops = factory::internal::ConvElementwiseOps<SIGNATURE>>
concept CkConvInstance = requires(Conv& conv,
// TODO: This should be changed depending on IsMultiA etc.
// Currently that is not yet supported elsewhere anyway.
const void* p_a,
const void* p_b,
void* p_e,
std::array<index_t, SPATIAL_DIM + 3> lengths,
std::array<index_t, SPATIAL_DIM + 3> strides,
std::array<index_t, SPATIAL_DIM> filter,
Ops::InElementwiseOp elementwise_a,
Ops::WeiElementwiseOp elementwise_b,
Ops::OutElementwiseOp elementwise_cde) {
concept CkConvFwdInstance = requires(Conv& conv,
// TODO: This should be changed depending on IsMultiA etc.
// Currently that is not yet supported elsewhere anyway.
const void* p_a,
const void* p_b,
void* p_e,
std::array<index_t, SPATIAL_DIM + 3> lengths,
std::array<index_t, SPATIAL_DIM + 3> strides,
std::array<index_t, SPATIAL_DIM> filter,
Ops::InElementwiseOp elementwise_a,
Ops::WeiElementwiseOp elementwise_b,
Ops::OutElementwiseOp elementwise_cde) {
requires ValidConvSignature<SIGNATURE>;
requires ConvDirectionIsForward<SIGNATURE>;
{
conv.MakeArgument(p_a,
p_b,
@@ -73,7 +75,7 @@ concept CkConvInstance = requires(Conv& conv,
} // namespace detail
/// @brief Concept for checking whether a convolution is invoked like old CK.
/// @brief Concept for checking whether a fwd convolution is invoked like old CK.
///
/// This concept is used to tell whether a convolution implementation is
/// likely to be an "old CK" implementation - that is, whether we should
@@ -83,20 +85,17 @@ concept CkConvInstance = requires(Conv& conv,
/// - SIGNATURE is the operation signature.
/// - Conv is a convolution instance created by the CK Builder API.
template <typename Conv, auto SIGNATURE>
concept CkConvInstance = detail::CkConvInstance<Conv, SIGNATURE>;
concept CkConvFwdInstance = detail::CkConvFwdInstance<Conv, SIGNATURE>;
/// @brief `run()` specialization for forward convolution and old CK.
///
/// @tparam SIGNATURE Forward convolution signature.
/// @throws std::runtime_error if the arguments weren't actually valid for the
/// operation. This should be caught and reported by the testing framework.
/// @return std::tuple<bool, float> - whether the problem is supported and
/// kernel execution time (0.0f if s_conf time_kernel is false).
/// @returns RunResult about how the operation completed (or not).
///
/// @see run()
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
std::tuple<bool, float> run(CkConvInstance<SIGNATURE> auto& conv,
[[nodiscard]] RunResult run(CkConvFwdInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs,
@@ -126,6 +125,9 @@ std::tuple<bool, float> run(CkConvInstance<SIGNATURE> auto& conv,
const auto weight_desc = args.make_weight_descriptor();
const auto output_desc = args.make_output_descriptor();
if(args.k_batch != 1)
return RunResult::not_supported("ck fwd does not support k_batch != 1");
auto ck_args = conv.MakeArgument(inputs.input,
inputs.weight,
{},
@@ -147,11 +149,9 @@ std::tuple<bool, float> run(CkConvInstance<SIGNATURE> auto& conv,
args.cde_elementwise_op);
if(!conv.IsSupportedArgument(ck_args))
{
std::cout << "invalid argument" << std::endl;
}
return RunResult::not_supported("unsupported ck arguments");
return std::make_tuple(true, conv.MakeInvoker().Run(ck_args, s_conf));
return RunResult::from_runtime(conv.MakeInvoker().Run(ck_args, s_conf));
}
} // namespace ck_tile::builder::test

View File

@@ -0,0 +1,137 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/builder/testing/testing.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include <stdexcept>
#include <vector>
/// This file contains the implementation details for invoking/testing
/// grouped convolution operations using the reference implementation.
/// The main item is the `run()` function, which is the primary way to
/// invoke the reference execution mechanism.
/// The implementation of this file mostly looks like `conv_fwd_ck.hpp`,
/// but its made specific to the reference implementation, which is
/// invoked in a slightly different way.
namespace ck_tile::builder::test {
namespace detail {
/// @brief Concept for checking whether this is the reference convolution
/// implementation.
///
/// This concept is used to tell whether a convolution implementation is
/// likely to be the reference implementation - that is, whether we should
/// invoke it like the reference kernel. This is mainly used with `run()` to
/// differentiate which implementation that should be invoked.
///
/// - SIGNATURE is the operation signature.
/// - Conv is a convolution instance created by the CK Builder API.
/// - InDataType, WeiDataType, OutDataType are the types of the respective tensors.
template <typename Conv,
auto SIGNATURE,
typename InDataType,
typename WeiDataType,
typename OutDataType>
concept RefConvInstance = requires(Conv& conv,
InDataType* input,
WeiDataType* weight,
OutDataType* output,
ck::utils::conv::ConvParam param) {
requires ValidConvSignature<SIGNATURE>;
{ conv.Run(input, weight, output, param) };
};
/// @brief Generic `run` implementation for forward/backwards reference kernels.
///
/// @tparam SIGNATURE The signature of the operation to perform.
///
/// @return std::tuple<bool, float> - whether the problem is supported and
/// kernel execution time (0.0f for reference).
/// @see run()
template <auto SIGNATURE, typename InDataType, typename WeiDataType, typename OutDataType>
[[nodiscard]] RunResult
run(RefConvInstance<SIGNATURE, InDataType, WeiDataType, OutDataType> auto& conv,
const Args<SIGNATURE>& args,
InDataType* input,
WeiDataType* weight,
OutDataType* output)
{
// We don't want to compute the output dims manually, just get
// them via the existing infrastructure
const auto param = args.to_ck_conv_param();
// TODO: The reference convolution is currently missing a few features.
// Just throw for now, but regard these as TODO items that should be resolved
// eventually.
if(!args.make_input_descriptor().is_packed())
return RunResult::not_supported("TODO: Support non-packed input tensor in reference conv");
if(!args.make_weight_descriptor().is_packed())
return RunResult::not_supported("TODO: Support non-packed weight tensor in reference conv");
if(!args.make_output_descriptor().is_packed())
return RunResult::not_supported("TODO: Support non-packed output tensor in reference conv");
conv.Run(input, weight, output, param);
return RunResult::from_runtime(0); // ref conv does not return a meaningful runtime.
}
} // namespace detail
/// @brief Concept for checking whether this is the reference convolution
/// forward implementation.
template <typename Conv, auto SIGNATURE>
concept RefConvFwdInstance =
detail::RefConvInstance<Conv, SIGNATURE, const void*, const void*, void*> &&
ConvDirectionIsForward<SIGNATURE>;
/// @brief `run()` specialization for forward convolution and the reference
/// forward implementation.
///
/// @tparam SIGNATURE The signature of the operation to perform. Must be forwards.
/// @returns RunResult about how the operation completed (or not).
///
/// @see run()
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> &&
// TODO: Maybe we can unify this implementation for bwd/weight too?
// for now, just concern outselves with reference and see when the
// rest of the bwd/weight plumbing is there.
ConvDirectionIsForward<SIGNATURE>
[[nodiscard]] RunResult run(RefConvFwdInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs)
{
return detail::run(conv, args, inputs.input, inputs.weight, outputs.output);
}
/// @brief Concept for checking whether this is the reference convolution
/// backward weight implementation.
template <typename Conv, auto SIGNATURE>
concept RefConvBwdWeightInstance =
detail::RefConvInstance<Conv, SIGNATURE, const void*, void*, const void*> &&
ConvDirectionIsBackwardWeight<SIGNATURE>;
/// @brief `run()` specialization for forward convolution and the reference
/// backward weight implementation.
///
/// @tparam SIGNATURE The signature of the operation to perform. Must be backwards weight.
/// @returns RunResult about how the operation completed (or not).
///
/// @see run()
template <auto SIGNATURE>
[[nodiscard]] RunResult run(RefConvBwdWeightInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs)
{
return detail::run(conv, args, inputs.input, outputs.weight, inputs.output);
}
} // namespace ck_tile::builder::test

View File

@@ -1,88 +0,0 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/builder/testing/conv_fwd.hpp"
#include <stdexcept>
#include <vector>
/// This file contains the implementation details for invoking/testing
/// grouped convolution operations using the reference implementation.
/// The main item is the `run()` function, which is the primary way to
/// invoke the reference execution mechanism.
/// The implementation of this file mostly looks like `conv_fwd_ck.hpp`,
/// but its made specific to the reference implementation, which is
/// invoked in a slightly different way.
namespace ck_tile::builder::test {
/// @brief Concept for checking whether this is the reference convolution
/// implementation.
///
/// This concept is used to tell whether a convolution implementation is
/// likely to be the reference implementation - that is, whether we should
/// invoke it like the reference kernel. This is mainly used with `run()` to
/// differentiate which implementation that should be invoked.
///
/// - SIGNATURE is the operation signature.
/// - Conv is a convolution instance created by the CK Builder API.
template <typename Conv, auto SIGNATURE>
concept RefConvInstance = requires(Conv& conv,
const void* input,
const void* weight,
void* output,
ck::utils::conv::ConvParam param) {
{ conv.Run(input, weight, output, param) };
};
/// @brief `run()` specialization for forward convolution and the reference
/// implementation.
///
/// @tparam SIGNATURE Forward convolution signature.
/// @throws std::runtime_error if the arguments weren't actually valid for the
/// operation. This should be caught and reported by the testing framework.
///
/// @return std::tuple<bool, float> - whether the problem is supported and
/// kernel execution time (0.0f for reference).
/// @see run()
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> &&
// TODO: Maybe we can unify this implementation for bwd/weight too?
// for now, just concern outselves with reference and see when the
// rest of the bwd/weight plumbing is there.
ConvDirectionIsForward<SIGNATURE>
std::tuple<bool, float> run(RefConvInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs)
{
// We don't want to compute the output dims manually, just get
// them via the existing infrastructure
const auto param = args.to_ck_conv_param();
// TODO: The reference convolution is currently missing a few features.
// Just throw for now, but regard these as TODO items that should be resolved
// eventually.
if(!args.make_input_descriptor().is_packed())
{
std::cout << "TODO: Support non-packed input tensor in reference conv" << std::endl;
return std::make_tuple(false, 0.0f);
}
if(!args.make_weight_descriptor().is_packed())
{
std::cout << "TODO: Support non-packed weight tensor in reference conv" << std::endl;
return std::make_tuple(false, 0.0f);
}
if(!args.make_output_descriptor().is_packed())
{
std::cout << "TODO: Support non-packed output tensor in reference conv" << std::endl;
return std::make_tuple(false, 0.0f);
}
conv.Run(inputs.input, inputs.weight, outputs.output, param);
return std::make_tuple(true, 0.0f);
}
} // namespace ck_tile::builder::test

View File

@@ -12,6 +12,7 @@
#include "ck_tile/builder/conv_signature_concepts.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
#include "ck_tile/builder/testing/type_traits.hpp"
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include "ck/utility/data_type.hpp"

View File

@@ -3,7 +3,11 @@
#pragma once
#include <optional>
#include <concepts>
#include <string_view>
#include <string>
#include <iosfwd>
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
#include "ck_tile/builder/testing/tensor_buffer.hpp"
@@ -288,6 +292,57 @@ ValidationReport validate(const Args<SIGNATURE>& args,
Outputs<SIGNATURE> actual,
Outputs<SIGNATURE> expected) = delete;
/// @brief This structure represents the result of a run operation.
///
/// The structure contains multiple fields with information about
/// how the operation completed (or not). See those for more info.
struct RunResult
{
/// If this value is not set to `std::nullopt`, there was a problem
/// while running the algorithm. In this case, the outputs are not
/// valid (though may be partially or completely overwritten), and
/// the optional contains a short debug message that indicates the
/// problem.
std::optional<std::string> error = std::nullopt;
/// The runtime of the kernel in milliseconds, if measured. Whether the
/// runtime is measured at all depends on the stream configuration
/// passed to run(). 0 if not measured or if there was an error. This
/// value is averaged over the total amount of runs actually done. Again,
/// this is usually configured via the stream config.
float runtime = 0.f;
/// @brief Utility function for constructing a RunResult from an unsupported operation.
///
/// @param msg A short debug message that will be included in the result.
constexpr static RunResult not_supported(std::string_view msg)
{
return RunResult{.error = std::string(msg)};
}
/// @brief Utility function for constructing a RunResult from an average runtime,
/// indicating a successful operation.
///
/// @param runtime The runtime of the kernel in milliseconds.
constexpr static RunResult from_runtime(const float runtime)
{
return RunResult{.runtime = runtime};
}
/// @brief Returns whether this algorithm executed successfully.
///
/// In this case there should be no message in `error`.
bool is_supported() const { return !this->error.has_value(); }
};
inline std::ostream& operator<<(std::ostream& os, const RunResult& result)
{
if(result.error.has_value())
return os << "invalid run (" << result.error.value() << ")";
else
return os << "successful run (" << result.runtime << " ms)";
}
/// @brief Invoke a device operation created by CK Builder.
///
/// This is the main function used to invoke a particular device operation
@@ -318,13 +373,14 @@ ValidationReport validate(const Args<SIGNATURE>& args,
/// @param outputs The output tensor data. The contents will be overwritten by
/// this function.
/// @param s_conf Stream config used to launch kernel.
/// @return std::tuple<bool, float> - whether the problem is supported and
/// kernel execution time (0.0f if s_conf time_kernel is false).
/// @returns RunResult about how the operation completed (or not).
///
/// @note This function is explicitly deleted to generate compile errors
/// for missing implementations.
///
/// @see RunResult
template <auto SIGNATURE, typename Operation, typename StreamConf>
std::tuple<bool, float> run(Operation& operation,
[[nodiscard]] RunResult run(Operation& operation,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs,

View File

@@ -5,6 +5,8 @@
#include <string_view>
#include "ck_tile/builder/testing/testing.hpp"
/// testing.hpp requires developers of a type of SIGNATURE to implement
/// quite a lot of functionality for each SIGNATURE. For example, next
/// to `Args`, `Inputs`, `Outputs`, `run`, they also have to define

View File

@@ -51,6 +51,9 @@ struct ValidationReport
/// The number of elements which were bitwise 0.
uint64_t zero_elements;
// Max error.
double max_error;
/// @brief Check whether both the output and reference tensor were both all zeros.
///
/// If both tensors are all zero, it indicates either an incorrect testing setup
@@ -133,11 +136,12 @@ bool ValidationReport::check(std::string_view tensor_name,
// Initial pass: count errors
// Allocate and reset counter
auto d_counters = alloc_buffer(sizeof(uint64_t) * 2);
check_hip(hipMemset(d_counters.get(), 0, sizeof(uint64_t) * 2));
auto d_counters = alloc_buffer(sizeof(uint64_t) * 3);
check_hip(hipMemset(d_counters.get(), 0, sizeof(uint64_t) * 3));
auto d_error_count = &reinterpret_cast<uint64_t*>(d_counters.get())[0];
auto d_zero_count = &reinterpret_cast<uint64_t*>(d_counters.get())[1];
auto d_max_error = &reinterpret_cast<double*>(d_counters.get())[2];
tensor_foreach(descriptor.get_lengths(), [=](auto index) {
using CKType = typename factory::internal::DataTypeToCK<DT>::type;
@@ -157,6 +161,7 @@ bool ValidationReport::check(std::string_view tensor_name,
const auto r = static_cast<double>(type_convert<float>(b));
const auto err = std::abs(o - r);
atomicMax(d_max_error, err);
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
{
// We expect the number of errors to be very low, so just use an atomic
@@ -188,6 +193,8 @@ bool ValidationReport::check(std::string_view tensor_name,
check_hip(hipMemcpy(&error_count, d_error_count, sizeof(uint64_t), hipMemcpyDeviceToHost));
uint64_t zero_count = 0;
check_hip(hipMemcpy(&zero_count, d_zero_count, sizeof(uint64_t), hipMemcpyDeviceToHost));
double max_error = 0;
check_hip(hipMemcpy(&max_error, d_max_error, sizeof(double), hipMemcpyDeviceToHost));
// TODO: Gather detailed coordinates.
@@ -196,6 +203,7 @@ bool ValidationReport::check(std::string_view tensor_name,
.wrong_elements = error_count,
.total_elements = descriptor.get_element_size(),
.zero_elements = zero_count,
.max_error = max_error,
});
return reports_.back().is_ok();

View File

@@ -157,6 +157,7 @@ enum class PipelineVersion
V3,
V4,
V5,
V6,
WEIGHT_ONLY
};
@@ -328,6 +329,7 @@ inline std::string_view to_string(PipelineVersion ver)
case V3: return "V3";
case V4: return "V4";
case V5: return "V5";
case V6: return "V6";
case WEIGHT_ONLY: return "WEIGHT_ONLY";
default: return "Unknown";
}

View File

@@ -168,7 +168,7 @@ add_ck_builder_test(test_ckb_build_fwd_instances
conv/ck/test_ckb_conv_fwd_3d_fp16.cpp
conv/ck/test_ckb_conv_fwd_3d_fp32.cpp
conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp
)
)
target_link_libraries(test_ckb_build_fwd_instances PRIVATE utility)
set(BWD_WEIGHT_TESTS

View File

@@ -1,23 +1,30 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "ck_tile/builder/testing/conv/bwd_weight.hpp"
#include "ck_tile/builder/testing/conv/bwd_weight_ck.hpp"
#include "ck_tile/builder/testing/conv/reference.hpp"
#include "ck_tile/host/device_prop.hpp"
#include "utils/ckb_conv_test_configs.hpp"
#include "utils/ckb_conv_test_utils.hpp"
#include "utils/conv_algorithm_type_utils.hpp"
#include "ck_tile/host/device_prop.hpp"
#include "testing_utils.hpp"
namespace ckb = ck_tile::builder;
namespace ckt = ck_tile::builder::test;
namespace cku = ck_tile::builder::test_utils;
using enum ck_tile::builder::TensorLayout;
using ck_tile::test::MatchesReference;
using ck_tile::test::SuccessfulRun;
constexpr auto SIGNATURE = ckt::ConvSignature{.spatial_dim = 1,
.direction = ckb::ConvDirection::BACKWARD_WEIGHT,
.data_type = ckb::DataType::BF16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = NGCW}},
.input = {.config = {.layout = GNWC}},
.weight = {.config = {.layout = GKXC}},
.output = {.config = {.layout = NGKW}}};
.output = {.config = {.layout = GNWK}}};
constexpr auto ALGORITHM =
cku::ConvAlgorithm_DeviceGroupedConvBwdWeight_Xdl_CShuffle_V3{}
@@ -30,14 +37,58 @@ constexpr auto ALGORITHM =
using Builder = ckb::ConvBuilder<SIGNATURE, ALGORITHM>;
using Instance = Builder::Instance;
using Reference = ckb::ConvBuilder<SIGNATURE, ckt::ConvAlgorithm_Reference{}>::Instance;
TEST(BwdWeight_1DBf16_CShuffle_V3, Create)
{
const auto expected_transfer_parameters = to_string(ALGORITHM);
cku::run_test<Builder>({"DeviceGroupedConvBwdWeight_Xdl_CShuffleV3",
expected_transfer_parameters,
"Filter1x1Stride1Pad0",
"NGCW,GKXC,NGKW",
"GNWC,GKXC,GNWK",
"PassThrough,PassThrough,PassThrough",
"Intrawave",
"v2"});
}
TEST(BwdWeight_1DBf16_CShuffle_V3, Execution)
{
if(!ck_tile::get_device_name().starts_with("gfx9"))
{
// Note: XDL kernel
GTEST_SKIP() << "unsupported architecture";
}
ckt::Args<SIGNATURE> args = {
.lengths =
{
.batch_size = 16,
.groups = 1,
.input_channels = 32,
.output_channels = 48,
.image = {.width = 64},
.filter = {.width = 1},
},
.filter_strides = {.width = 1},
.filter_dilation = {.width = 1},
.input_left_pad = {.width = 0},
.input_right_pad = {.width = 0},
.a_elementwise_op = {},
.b_elementwise_op = {},
.cde_elementwise_op = {},
};
auto inputs = ckt::alloc_inputs(args);
auto outputs = ckt::alloc_outputs(args);
auto reference = ckt::alloc_outputs(args);
ckt::init_inputs(args, inputs.get());
auto conv = Instance{};
EXPECT_THAT(ckt::run(conv, args, inputs.get(), outputs.get()), SuccessfulRun());
auto ref_conv = Reference{};
EXPECT_THAT(ckt::run(ref_conv, args, inputs.get(), reference.get()), SuccessfulRun());
EXPECT_THAT(outputs.get(), MatchesReference(args, reference.get()));
}

View File

@@ -4,8 +4,9 @@
#include "utils/ckb_conv_test_configs.hpp"
#include "utils/ckb_conv_test_utils.hpp"
#include "utils/conv_algorithm_type_utils.hpp"
#include "ck_tile/builder/testing/conv_fwd_ck.hpp"
#include "ck_tile/builder/testing/conv_fwd_reference.hpp"
#include "ck_tile/builder/testing/conv/fwd.hpp"
#include "ck_tile/builder/testing/conv/fwd_ck.hpp"
#include "ck_tile/builder/testing/conv/reference.hpp"
#include "ck_tile/host/device_prop.hpp"
#include "testing_utils.hpp"
@@ -14,6 +15,7 @@ namespace ckt = ck_tile::builder::test;
namespace cku = ck_tile::builder::test_utils;
using ck_tile::test::MatchesReference;
using ck_tile::test::SuccessfulRun;
constexpr auto SIGNATURE =
ckt::ConvSignature{.spatial_dim = 2,
@@ -50,10 +52,11 @@ TEST(Fwd2DFp16_CShufV3_GNHWC, Create)
"MNKPadding"});
}
TEST(Fwd2DFp16_CShufV3_GNHWC, EndToEnd)
TEST(Fwd2DFp16_CShufV3_GNHWC, Execution)
{
if(!ck_tile::get_device_name().starts_with("gfx9"))
{
// Note: XDL kernel
GTEST_SKIP() << "unsupported architecture";
}
@@ -91,10 +94,10 @@ TEST(Fwd2DFp16_CShufV3_GNHWC, EndToEnd)
ckt::init_inputs(args, inputs.get());
auto conv = Instance{};
ckt::run(conv, args, inputs.get(), outputs.get());
EXPECT_THAT(ckt::run(conv, args, inputs.get(), outputs.get()), SuccessfulRun());
auto ref_conv = Reference{};
ckt::run(ref_conv, args, inputs.get(), reference.get());
EXPECT_THAT(ckt::run(ref_conv, args, inputs.get(), reference.get()), SuccessfulRun());
EXPECT_THAT(outputs.get(), MatchesReference(args, reference.get()));
}

View File

@@ -1,35 +1,47 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "ck_tile/builder/testing/conv/bwd_weight.hpp"
#include "ck_tile/builder/testing/conv/ck_tile.hpp"
#include "ck_tile/builder/testing/conv/reference.hpp"
#include "ck_tile/host/device_prop.hpp"
#include "utils/ckb_conv_tile_test_configs.hpp"
#include "utils/ckb_conv_test_utils.hpp"
#include "testing_utils.hpp"
namespace {
namespace ckb = ck_tile::builder;
namespace ckt = ck_tile::builder::test;
namespace cku = ck_tile::builder::test_utils;
using namespace ck_tile::builder::test_utils;
using enum ck_tile::builder::TensorLayout;
using ck_tile::test::MatchesReference;
using ck_tile::test::SuccessfulRun;
TEST(BwdWeightConvInstances, Create_ConvAlgorithm_Tile_GroupedConvolutionKernel_2D_FP16_NHWGC)
constexpr auto SIGNATURE = cku::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::BACKWARD_WEIGHT,
.data_type = ckb::DataType::FP16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = NHWGC}},
.weight = {.config = {.layout = GKYXC}},
.output = {.config = {.layout = NHWGK}}};
constexpr auto ALGORITHM =
cku::ConvAlgorithm_Tile_GroupedConvolutionKernel{}
.with_tile_specializations(ckb::TileConvSpecialization::DEFAULT)
.with_tile_thread_block(cku::TileThreadBlock_64x64x64)
.with_tile_block_gemm(cku::TileBlockGemmDesc_16x16_v3_intrawave)
.with_tile_transfer(cku::TileTransfer_4x4x4)
.with_tile_optimizations(ckt::TileOptimizations{
.num_groups_to_merge = 1, .split_image = false, .explicit_gemm = false});
using Builder = ckb::ConvBuilder<SIGNATURE, ALGORITHM>;
using Instance = Builder::Instance;
using Reference = ckb::ConvBuilder<SIGNATURE, ckt::ConvAlgorithm_Reference{}>::Instance;
TEST(BwdWeight_2D_FP16_NHWGC, Create)
{
constexpr ConvSignature BwdWeightConvSignature{
.spatial_dim = 2,
.direction = ConvDirection::BACKWARD_WEIGHT,
.data_type = DataType::FP16,
.accumulation_data_type = DataType::FP32,
.input = {.config = {.layout = TensorLayout::NHWGC}},
.weight = {.config = {.layout = TensorLayout::GKYXC}},
.output = {.config = {.layout = TensorLayout::NHWGK}}};
constexpr auto BwdWeightConvAlgorithm =
ConvAlgorithm_Tile_GroupedConvolutionKernel{}
.with_tile_specializations(TileConvSpecialization::DEFAULT)
.with_tile_thread_block(TileThreadBlock_64x64x64)
.with_tile_block_gemm(TileBlockGemmDesc_16x16_v3_intrawave)
.with_tile_transfer(TileTransfer_4x4x4)
.with_tile_optimizations(TileOptimizations{
.num_groups_to_merge = 1, .split_image = false, .explicit_gemm = false});
using Builder = ConvBuilder<BwdWeightConvSignature, BwdWeightConvAlgorithm>;
run_ck_tile_test<Builder>({
cku::run_ck_tile_test<Builder>({
"grouped_convolution_backward_weight",
"fp16",
"NHWGC_GKYXC_NHWGK",
@@ -49,4 +61,38 @@ TEST(BwdWeightConvInstances, Create_ConvAlgorithm_Tile_GroupedConvolutionKernel_
});
}
} // namespace
TEST(BwdWeight_2D_FP16_NHWGC, Execution)
{
ckt::Args<SIGNATURE> args = {
.lengths =
{
.batch_size = 2,
.groups = 4,
.input_channels = 32,
.output_channels = 48,
.image = {.width = 32, .height = 56},
.filter = {.width = 3, .height = 3},
},
.filter_strides = {.width = 1, .height = 1},
.filter_dilation = {.width = 1, .height = 1},
.input_left_pad = {.width = 0, .height = 0},
.input_right_pad = {.width = 0, .height = 0},
.a_elementwise_op = {},
.b_elementwise_op = {},
.cde_elementwise_op = {},
};
auto inputs = ckt::alloc_inputs(args);
auto outputs = ckt::alloc_outputs(args);
auto reference = ckt::alloc_outputs(args);
ckt::init_inputs(args, inputs.get());
auto conv = Instance{};
EXPECT_THAT(ckt::run(conv, args, inputs.get(), outputs.get()), SuccessfulRun());
auto ref_conv = Reference{};
EXPECT_THAT(ckt::run(ref_conv, args, inputs.get(), reference.get()), SuccessfulRun());
EXPECT_THAT(outputs.get(), MatchesReference(args, reference.get()));
}

View File

@@ -4,8 +4,8 @@
#include "utils/ckb_conv_tile_test_configs.hpp"
#include "utils/ckb_conv_test_utils.hpp"
#include "utils/conv_algorithm_type_utils.hpp"
#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp"
#include "ck_tile/builder/testing/conv_fwd_reference.hpp"
#include "ck_tile/builder/testing/conv/ck_tile.hpp"
#include "ck_tile/builder/testing/conv/reference.hpp"
#include "ck_tile/host/device_prop.hpp"
#include "testing_utils.hpp"
@@ -13,6 +13,9 @@ namespace ckb = ck_tile::builder;
namespace ckt = ck_tile::builder::test;
namespace cku = ck_tile::builder::test_utils;
using ck_tile::test::MatchesReference;
using ck_tile::test::SuccessfulRun;
constexpr auto SIGNATURE =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::FORWARD,
@@ -75,10 +78,10 @@ TEST(Fwd2DFp16_CShufV3_NHWGC, EndToEnd)
ckt::init_inputs(args, inputs.get());
auto conv = Instance{};
ckt::run(conv, args, inputs.get(), outputs.get());
EXPECT_THAT(ckt::run(conv, args, inputs.get(), outputs.get()), SuccessfulRun());
auto ref_conv = Reference{};
ckt::run(ref_conv, args, inputs.get(), reference.get());
EXPECT_THAT(ckt::run(ref_conv, args, inputs.get(), reference.get()), SuccessfulRun());
EXPECT_THAT(outputs.get(), ck_tile::test::MatchesReference(args, reference.get()));
EXPECT_THAT(outputs.get(), MatchesReference(args, reference.get()));
}

View File

@@ -5,11 +5,14 @@
#include "testing_utils.hpp"
namespace ckt = ck_tile::builder::test;
using ck_tile::test::HipError;
using ck_tile::test::HipSuccess;
using ck_tile::test::InstanceMatcher;
using ck_tile::test::InstanceSet;
using ck_tile::test::StringEqWithDiff;
using ck_tile::test::SuccessfulRun;
TEST(InstanceSet, FromFactory)
{
@@ -107,3 +110,17 @@ TEST(HipStatusMatcher, Basic)
EXPECT_THAT(hipSuccess, Not(HipError(hipErrorInvalidValue)));
EXPECT_THAT(hipErrorOutOfMemory, Not(HipError(hipErrorInvalidValue)));
}
TEST(RunResultMatcher, Basic)
{
EXPECT_THAT(ckt::RunResult::from_runtime(0), SuccessfulRun());
EXPECT_THAT(ckt::RunResult::not_supported("test error"), Not(SuccessfulRun()));
}
TEST(RunResultMatcher, ExplainMatchResult)
{
testing::StringMatchResultListener listener;
EXPECT_TRUE(!ExplainMatchResult(
SuccessfulRun(), ckt::RunResult::not_supported("test error"), &listener));
EXPECT_THAT(listener.str(), StringEqWithDiff("run failed: test error"));
}

View File

@@ -339,4 +339,22 @@ void HipStatusMatcher::DescribeNegationTo(std::ostream* os) const
return ::testing::MakeMatcher(new HipStatusMatcher(error));
}
bool RunResultMatcher::MatchAndExplain(builder::test::RunResult actual,
::testing::MatchResultListener* listener) const
{
if(actual.error.has_value() && listener)
*listener << "run failed: " << actual.error.value();
return actual.is_supported();
}
void RunResultMatcher::DescribeTo(std::ostream* os) const { *os << "successful run"; }
void RunResultMatcher::DescribeNegationTo(std::ostream* os) const { *os << "unsuccessful run"; }
::testing::Matcher<builder::test::RunResult> SuccessfulRun()
{
return ::testing::MakeMatcher(new RunResultMatcher());
}
} // namespace ck_tile::test

View File

@@ -161,6 +161,23 @@ struct HipStatusMatcher : public ::testing::MatcherInterface<hipError_t>
/// @param error The error to expect.
::testing::Matcher<hipError_t> HipError(hipError_t error);
/// @brief RunResult matcher
///
/// `ckt::run` returns a RunResult which indicates whether there was any
/// problem while running the algorithm. This matcher is used to match those
/// values.
struct RunResultMatcher : public ::testing::MatcherInterface<builder::test::RunResult>
{
bool MatchAndExplain(builder::test::RunResult actual,
::testing::MatchResultListener* listener) const override;
void DescribeTo(std::ostream* os) const override;
void DescribeNegationTo(std::ostream* os) const override;
};
/// @brief Construct a Google Test matcher that checks that a ckt::run result
/// was successful.
::testing::Matcher<builder::test::RunResult> SuccessfulRun();
template <auto SIGNATURE>
struct ReferenceOutputMatcher
: public ::testing::MatcherInterface<builder::test::Outputs<SIGNATURE>>
@@ -180,6 +197,21 @@ struct ReferenceOutputMatcher
if(listener->IsInterested() && !errors.empty())
{
*listener << errors.size() << " tensors failed to validate";
for(const auto& e : errors)
{
*listener << "\n - " << e.tensor_name << ": ";
if(e.is_all_zero())
*listener << "all elements in actual and expected tensors are zero";
else
{
// Round to 2 digits
const float percentage = e.wrong_elements * 10000 / e.total_elements / 100.f;
*listener << e.wrong_elements << "/" << e.total_elements
<< " incorrect elements (~" << percentage << "%)";
}
}
}
return errors.empty();

View File

@@ -3,7 +3,7 @@
#include "impl/conv_signature_types.hpp"
#include "testing_utils.hpp"
#include "ck_tile/builder/testing/conv_fwd.hpp"
#include "ck_tile/builder/testing/conv/fwd.hpp"
#include "ck_tile/builder/testing/tensor_foreach.hpp"
#include <gtest/gtest.h>
#include <gmock/gmock.h>

View File

@@ -296,5 +296,8 @@ TEST(MatchesReference, Incorrect)
testing::StringMatchResultListener listener;
EXPECT_TRUE(!ExplainMatchResult(MatchesReference(args, expected), actual, &listener));
EXPECT_THAT(listener.str(), StringEqWithDiff("1 tensors failed to validate"));
EXPECT_THAT(listener.str(),
StringEqWithDiff( //
"1 tensors failed to validate\n"
" - a: 625/625 incorrect elements (~100%)"));
}

View File

@@ -20,9 +20,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stri
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>

View File

@@ -20,9 +20,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stri
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>

View File

@@ -20,9 +20,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Stri
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>

View File

@@ -20,9 +20,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stri
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>

View File

@@ -20,9 +20,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stri
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>

View File

@@ -20,9 +20,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Stri
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>

View File

@@ -1,5 +1,6 @@
#include "../../builder/test/utils/ckb_conv_tile_test_configs.hpp"
#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp"
#include "ck_tile/builder/testing/conv/fwd.hpp"
#include "ck_tile/builder/testing/conv/ck_tile.hpp"
namespace ckb = ck_tile::builder;
namespace ckt = ck_tile::builder::test;

View File

@@ -2,8 +2,6 @@
using Builder = ckb::ConvBuilder<SIGNATURE, ALGORITHM>;
using Instance = Builder::Instance;
auto conv = Instance{};
bool is_supported;
float avg_time;
std::tie(is_supported, avg_time) = ckt::run(conv, args, inputs, outputs, s_conf);
return std::make_tuple(is_supported, avg_time, conv.GetInstanceString());
auto conv = Instance{};
ckt::RunResult result = ckt::run(conv, args, inputs, outputs, s_conf);
return std::make_tuple(result.is_supported(), result.runtime, conv.GetInstanceString());

View File

@@ -0,0 +1,225 @@
# Build Time Optimization
Tracking issue: [#3575](https://github.com/ROCm/composable_kernel/issues/3575)
This document describes techniques for reducing C++ template instantiation overhead in the Composable Kernel codebase.
## Why Build Time Matters
Composable Kernel relies heavily on C++ template metaprogramming to achieve GPU kernels with no runtime abstraction penalty. However, deep template instantiation can significantly impact build times. A single translation unit may trigger hundreds of thousands of template instantiations, with each instantiation adding to compile time.
## Key Types
This codebase uses compile-time types to enable zero-overhead abstractions:
- `Number<N>` - compile-time integer, enables static dispatch and compile-time arithmetic
- `Sequence<Is...>` - compile-time integer sequence, used for dimension ordering and index manipulation
- `Tuple<Ts...>` - heterogeneous container holding different types, used for tensor descriptors and transforms
These types allow the compiler to fully unroll loops, eliminate branches, and inline all operations - producing GPU kernels with no runtime abstraction cost.
## Optimization Techniques
### 1. Replace Recursive Templates with Pack Expansion
Recursive template patterns create O(N) instantiation depth - the compiler must instantiate each level before proceeding to the next:
```
sequence_gen_impl<5, F>
→ sequence_gen_impl<4, F>
→ sequence_gen_impl<3, F>
→ ...
```
Using `__make_integer_seq` (Clang/MSVC) combined with pack expansion reduces this to constant depth - the compiler generates the entire sequence in one step internally, without recursive template instantiation.
**Before** (O(N) recursive instantiation):
```cpp
template <index_t N, typename F, index_t... Is>
struct sequence_gen_impl
{
using type = typename sequence_gen_impl<N-1, F, F{}(Number<N-1>{}), Is...>::type;
};
template <typename F, index_t... Is>
struct sequence_gen_impl<0, F, Is...>
{
using type = Sequence<Is...>;
};
```
**After** (constant depth using compiler intrinsic + pack expansion):
```cpp
namespace detail {
template <typename T, T... Is>
struct sequence_gen_helper
{
// Apply functor F to all indices via pack expansion
// F{}(Number<0>{}), F{}(Number<1>{}), ..., F{}(Number<N-1>{})
template <typename F>
using apply = Sequence<F{}(Number<Is>{})...>;
};
} // namespace detail
template <index_t N, typename F>
struct sequence_gen
{
// __make_integer_seq<sequence_gen_helper, index_t, N> produces
// sequence_gen_helper<index_t, 0, 1, ..., N-1> with constant depth
using type =
typename __make_integer_seq<detail::sequence_gen_helper, index_t, N>::template apply<F>;
};
```
Note: This document assumes C++17 or later. While `std::make_integer_sequence` (introduced in C++14) is the standard library facility for generating integer sequences, it only produces `std::integer_sequence<T, ...>`. We use `__make_integer_seq` directly because it accepts any template as its first argument, enabling this pattern where the helper class receives the index pack directly.
### 2. Replace Lambdas with Named Functors
Each lambda expression creates a unique closure type, causing separate template instantiations at every call site. Named functors share a single type across all uses.
**Before** (lambda creates unique instantiations at each call site):
```cpp
// The lambda inside transform_tensor_descriptor:
generate_tuple([](auto i) { return Sequence<i>{}; }, Number<N>{});
```
**After** (named functor shares instantiations):
```cpp
// Define functor once
struct generate_identity_sequence
{
template <index_t I>
__host__ __device__ constexpr auto operator()(Number<I>) const
{
return Sequence<I>{};
}
};
// Use everywhere - shares instantiations
generate_tuple(generate_identity_sequence{}, Number<N>{});
```
This reduced `transform_tensor_descriptor` instantiations from 388 to 32 (92% reduction).
**Example: container_concat**
```cpp
// Before: lambda creates unique type per call site
// (unpack2 applies a functor to all elements from both tuples)
template <typename... X, typename... Y>
__host__ __device__ constexpr auto container_concat(const Tuple<X...>& tx, const Tuple<Y...>& ty)
{
return unpack2([](auto&&... zs) { return make_tuple(forward<decltype(zs)>(zs)...); }, tx, ty);
}
// After: named functor shares instantiations
struct make_tuple_functor
{
template <typename... Ts>
__host__ __device__ constexpr auto operator()(Ts&&... xs) const
{
return make_tuple(forward<Ts>(xs)...);
}
};
template <typename... X, typename... Y>
__host__ __device__ constexpr auto container_concat(const Tuple<X...>& tx, const Tuple<Y...>& ty)
{
return unpack2(make_tuple_functor{}, tx, ty);
}
```
This reduced `container_concat` instantiations from 186 to 93 (50% reduction).
**Example: make_uniform_tuple**
For patterns that create tuples with repeated values:
```cpp
// Before: unique lambda type at each call site
generate_tuple([](auto) { return some_value; }, Number<N>{});
// After: dedicated helper function
template <index_t N, typename T>
__host__ __device__ constexpr auto make_uniform_tuple(T&& value)
{
return detail::make_uniform_tuple_impl(static_cast<T&&>(value), make_index_sequence<N>{});
}
// Usage
make_uniform_tuple<N>(some_value);
```
### 3. Use Constexpr Loops Instead of Template Recursion
Template recursion creates N template instantiations for N iterations. A constexpr loop executes at compile time but only requires a single template instantiation. While both are O(N) in complexity, constexpr loops are significantly faster because they avoid the overhead of template instantiation.
**Before** (O(N) template instantiations):
```cpp
// Simplified example - actual CK code used more complex recursive patterns
template <index_t Target, typename Seq, index_t Pos, bool AtEnd>
struct find_source_index_impl
{
static constexpr index_t value =
(Seq::template At<Pos>() == Target) ? Pos : find_source_index_impl<Target, Seq, Pos+1, (Pos+1 == Seq::Size())>::value;
};
template <index_t Target, typename Seq, index_t Pos>
struct find_source_index_impl<Target, Seq, Pos, true>
{
static constexpr index_t value = -1; // not found
};
```
**After** (single instantiation with constexpr loop):
```cpp
template <index_t Target, index_t... Is>
__host__ __device__ constexpr index_t find_source_index(Sequence<Is...>)
{
// Simplified example - actual implementation handles empty sequences
constexpr index_t values[] = {Is...};
for(index_t i = 0; i < sizeof...(Is); ++i)
if(values[i] == Target) return i;
return -1; // not found
}
```
This reduced `sequence_map_inverse` instantiations from 45 to 10 (78% reduction) and wall-clock time by 95%.
### 4. Use Fold Expressions for Accumulation
Fold expressions (C++17) can replace recursive template patterns for accumulation operations.
**Before** (uses helper utilities that hide template recursion: `generate_tuple` recursively constructs a tuple of N elements, and `container_reduce` recursively reduces that tuple):
```cpp
const auto element_space_size = container_reduce(
generate_tuple([&](auto i) {
return (lengths[i] - Number<1>{}) * strides[i];
}, Number<N>{}),
math::plus{}, Number<1>{});
```
**After** (single fold expression):
```cpp
template <typename... Lengths, typename... Strides, index_t... Is>
__host__ __device__ constexpr auto compute_element_space_size(
const Tuple<Lengths...>& lengths,
const Tuple<Strides...>& strides,
Sequence<Is...>)
{
return (LongNumber<1>{} + ... +
((lengths[Number<Is>{}] - Number<1>{}) * strides[Number<Is>{}]));
}
```
This reduced `calculate_element_space_size` instantiations from 24 to 10 (58% reduction) and wall-clock time by 73%.

View File

@@ -55,9 +55,6 @@
#ifndef CK_ENABLE_FP32
#define CK_ENABLE_FP32 "ON"
#endif
#ifndef CK_ENABLE_TF32
#define CK_ENABLE_TF32 "ON"
#endif
#ifndef CK_ENABLE_FP64
#define CK_ENABLE_FP64 "ON"
#endif
@@ -88,10 +85,6 @@
#cmakedefine CK_ENABLE_FP32 @CK_ENABLE_FP32@
#endif
#ifndef CK_ENABLE_TF32
#cmakedefine CK_ENABLE_TF32 @CK_ENABLE_TF32@
#endif
#ifndef CK_ENABLE_FP64
#cmakedefine CK_ENABLE_FP64 @CK_ENABLE_FP64@
#endif

View File

@@ -139,5 +139,22 @@ inline bool is_tf32_supported()
return ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950";
}
inline int __host__ get_lds_size()
{
int device = 0;
int result = 0;
auto status = hipGetDevice(&device);
if(status == hipSuccess)
{
status = hipDeviceGetAttribute(&result, hipDeviceAttributeMaxSharedMemoryPerBlock, device);
if(status == hipSuccess)
{
return result;
}
}
return 64 * 1024;
}
} // namespace ck
#endif

View File

@@ -160,6 +160,7 @@ struct ThreadGroupTransferGlobal
// check if src element is valid
const bool is_src_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
oob_thread_scratch_.template SetAsType<bool>(vgpr_data_idx_seq, is_src_valid);
// Vector length of elementwise operation
constexpr auto get_elem_op_vec_len = []() {
@@ -195,14 +196,12 @@ struct ThreadGroupTransferGlobal
using dst_vector_type = vector_type_maker_t<DstData, VectorSize>;
using dst_vector_t = typename dst_vector_type::type;
using vector_t = typename vector_type_maker<DstData, VectorSize>::type::type;
dst_vector_type op_r_v;
// Load data from memory in src_vector first
src_vector_container src_vector =
src_vector_container{grid_buf.template Get<src_vector_container_t, DoTranspose>(
src_coord_.GetOffset(), true)};
auto index = is_src_valid || !DoTranspose ? src_coord_.GetOffset() : 0;
src_vector_container src_vector = src_vector_container{
grid_buf.template Get<src_vector_container_t, DoTranspose>(index, true)};
// apply the src elementwise op and convert to DstData under the hood if needed
static_for<0, VectorSize / elem_op_vec_len, 1>{}([&](auto idx) {
@@ -213,9 +212,8 @@ struct ThreadGroupTransferGlobal
// store result in dvgpr_ (static array holding loaded data).
// At this point data is already converted to DstData type and
// the elementwise operation has been applied
dvgpr_.template SetAsType<dst_vector_t>(
vgpr_data_idx_seq,
is_src_valid ? op_r_v.template AsType<dst_vector_t>()[I0] : vector_t(0));
src_dvgpr_.template SetAsType<dst_vector_t>(vgpr_data_idx_seq,
op_r_v.template AsType<dst_vector_t>()[I0]);
// For each dimension move fwd, bwd or don't move
static_for<0, nDim, 1>{}([&](auto i) {
@@ -248,6 +246,39 @@ struct ThreadGroupTransferGlobal
container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
constexpr auto ordered_fwd_step = StepsPerIteration{};
// OOB check
static_ford<decltype(ordered_src_access_lengths)>{}([&](auto ordered_src_access_idx) {
// calculate src data index and make sequence
constexpr auto src_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}(
[&](auto i) { ordered_idx(i) = ordered_src_access_idx[i]; });
return container_reorder_given_old2new(ordered_idx, src_dim_access_order);
}();
// make sequence to access vgpr data. Add zero as last element of src_data_idx_seq
constexpr auto vgpr_data_idx_seq = generate_sequence_v2(
[&](auto i) {
if constexpr(i.value < src_data_idx.Size())
{
return Number<src_data_idx[i]>{};
}
else
{
return Number<0>{};
}
},
Number<src_data_idx.Size() + 1>{});
auto op_r = src_dvgpr_.template GetAsType<dst_vector_t>(vgpr_data_idx_seq);
const bool is_src_valid =
oob_thread_scratch_.template GetAsType<bool>(vgpr_data_idx_seq);
auto op_r_v = is_src_valid ? op_r : dst_vector_t(0);
dst_dvgpr_.template SetAsType<dst_vector_t>(vgpr_data_idx_seq, op_r_v);
});
// make forward steps
// forward step for each iteration just add 1
const auto dst_forward_steps = generate_tuple(
@@ -352,7 +383,7 @@ struct ThreadGroupTransferGlobal
dst_buf.template Set<dst_vector_t>(
dst_coord_.GetOffset(),
true,
dvgpr_.template GetAsType<dst_vector_t>(vgpr_data_idx_seq));
dst_dvgpr_.template GetAsType<dst_vector_t>(vgpr_data_idx_seq));
// For each dimension move fwd, bwd or don't move
static_for<0, nDim, 1>{}([&](auto i) {
@@ -389,6 +420,14 @@ struct ThreadGroupTransferGlobal
return make_naive_tensor_descriptor_packed(access_lengths_as_tuple);
}
__device__ static constexpr auto GetSrcThreadScratchDescriptor()
{
constexpr auto access_lengths_as_tuple =
container_push_back(sequence_to_tuple_of_number(NumberOfIterations{}), Number<1>{});
return make_naive_tensor_descriptor_packed(access_lengths_as_tuple);
}
static constexpr auto thread_data_scratch_desc_ = decltype(GetThreadScratchDataDescriptor()){};
using ThreadScratchData = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
DstData,
@@ -396,7 +435,17 @@ struct ThreadGroupTransferGlobal
decltype(thread_data_scratch_desc_),
true>;
ThreadScratchData dvgpr_;
static constexpr auto src_oob_thread_scratch_desc_ =
decltype(GetSrcThreadScratchDescriptor()){};
using OOBThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
bool,
1,
decltype(src_oob_thread_scratch_desc_),
true>;
ThreadScratchData src_dvgpr_;
ThreadScratchData dst_dvgpr_;
OOBThreadScratch oob_thread_scratch_;
SrcCoord src_coord_;
DstCoord dst_coord_;
const ElementwiseOperation element_op_;

View File

@@ -132,7 +132,7 @@ struct ThreadGroupTensorSliceTransfer_v7r3
}
template <typename T>
using is_tuple = decltype(std::declval<T&>().IsTuple());
using is_tuple = decltype(declval<T&>().IsTuple());
template <typename DstBuffers, index_t ThreadScratchId = 0>
__device__ void RunWrite(const DstDescs& dst_descs,

View File

@@ -144,7 +144,7 @@ struct ThreadGroupTensorSliceTransfer_v7r3_scatter
}
template <typename T>
using is_tuple = decltype(std::declval<T&>().IsTuple());
using is_tuple = decltype(declval<T&>().IsTuple());
template <typename DstBuffers, index_t ThreadScratchId = 0>
__device__ void RunWrite(const DstDescs& dst_descs,

View File

@@ -107,7 +107,7 @@ __device__ void device_grouped_conv_fwd_multiple_abd_xdl_cshuffle(
static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
const auto& ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
DsPointer p_ds_grid_grp;

View File

@@ -833,6 +833,26 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle_V3
return false;
}
if(ck::is_gfx12_supported() && !GridwiseGemm::CheckValidityAWaveTransfer(arg.M, arg.K))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix A" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
if(ck::is_gfx12_supported() && !GridwiseGemm::CheckValidityBWaveTransfer(arg.N, arg.K))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix B" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
// check vector access
static_assert((ABlockTransferSrcVectorDim == 1 || ABlockTransferSrcVectorDim == 2) &&
(BBlockTransferSrcVectorDim == 1 || BBlockTransferSrcVectorDim == 2),

View File

@@ -36,7 +36,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_contraction_multiple_d_xdl_cshuffle(
const FloatAB* __restrict__ p_a_grid,
@@ -59,7 +59,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);

View File

@@ -3,77 +3,21 @@
#pragma once
#include <iostream>
#include <sstream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3_common.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_wmma_cshuffle_v3.hpp"
#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/utility/tuple.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename DeviceOp, typename GridwiseOp, bool HasMainKBlockLoop, TailNumber TailNum>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_gemm_wmma_cshuffle_v3(typename DeviceOp::RawArg arg)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
__shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()];
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / arg.batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset =
__builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetABasePtr(g_idx)));
const long_index_t b0_batch_offset =
__builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetB0BasePtr(g_idx)));
const long_index_t b1_batch_offset =
__builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
const long_index_t c_batch_offset =
__builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
GridwiseOp::template Run<HasMainKBlockLoop, TailNum>(
arg.p_a_grid + a_batch_offset,
arg.p_b0_grid + b0_batch_offset,
Tuple<>{}, // p_d0s_grid
arg.p_b1_grid + b1_batch_offset,
Tuple<>{}, // p_d1s_grid
arg.p_c_grid + c_batch_offset,
p_shared,
arg.a_grid_desc,
arg.b0_grid_desc,
Tuple<>{}, // D0sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
arg.b1_grid_desc,
Tuple<>{}, // D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
arg.c_grid_desc_mblock_mperblock_nblock_nperblock,
arg.a_element_op,
arg.b0_element_op,
arg.acc_element_op,
arg.b1_element_op,
arg.c_element_op,
arg.block_2_ctile_map);
#else
ignore = arg;
#endif // (!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__)
}
// Computes C = A * B0 * B1
// MN = MK * KL * LN
// ^^^^^^ (Acc0)
@@ -157,88 +101,47 @@ struct DeviceBatchedGemmGemm_Wmma_CShuffleV3 : public DeviceBatchedGemmGemm<ALay
// to LPerWmma (A.k.a Gemm0 NPerWmma).
static constexpr index_t NPerWmma = LPerWmma;
// TODO: Now that we are no longer using NumDim or TensorSpec, we can probably use a simpler
// Transform operator or just not use one at all.
using Transform = TransformBatchedContractionContractionToBatchedGemmGemm_Wmma<
Sequence<1, 1, 1, 1, 1>,
Sequence<MPerBlock, LPerBlock, KPerBlock, NPerBlock>,
GemmSpec,
TensorSpecialization::Default, // ASpec
TensorSpecialization::Default, // B0Spec
TensorSpecialization::Default, // B1Spec
TensorSpecialization::Default>; // CSpec
__host__ __device__ static auto
MakeAGridDescriptor(const std::array<index_t, 3>& a_g_m_k_lengths_vec,
const std::array<index_t, 3>& a_g_m_k_strides_vec)
{
return Transform::MakeAGridDescriptor_AK0_M_AK1(
Transform::MakeAGridDescriptor_M_K(a_g_m_k_lengths_vec, a_g_m_k_strides_vec),
Number<AK1>{});
}
__host__ __device__ static auto
MakeB0GridDescriptor(const std::array<index_t, 3>& b0_g_l_k_lengths_vec,
const std::array<index_t, 3>& b0_g_l_k_strides_vec)
{
return Transform::MakeB0GridDescriptor_BK0_N_BK1(
Transform::MakeB0GridDescriptor_N_K(b0_g_l_k_lengths_vec, b0_g_l_k_strides_vec),
Number<BK1>{});
}
__host__ __device__ static auto
MakeB1GridDescriptor(const std::array<index_t, 3>& b1_g_n_l_lengths_vec,
const std::array<index_t, 3>& b1_g_n_l_strides_vec)
{
return Transform::MakeB1GridDescriptor_BK0_N_BK1(
Transform::MakeB1GridDescriptor_N_K(b1_g_n_l_lengths_vec, b1_g_n_l_strides_vec),
Number<L1>{});
}
using AGridDesc = decltype(MakeAGridDescriptor({}, {}));
using B0GridDesc = decltype(MakeB0GridDescriptor({}, {}));
using B1GridDesc = decltype(MakeB1GridDescriptor({}, {}));
using CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {}));
struct ComputeBasePtrOfStridedBatch
{
ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB0,
index_t BatchStrideB1,
index_t BatchStrideC)
: BatchStrideA_(BatchStrideA),
BatchStrideB0_(BatchStrideB0),
BatchStrideB1_(BatchStrideB1),
BatchStrideC_(BatchStrideC)
{
}
__host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetB0BasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB0_);
}
__host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB1_);
}
__host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideC_);
}
private:
index_t BatchStrideA_;
index_t BatchStrideB0_;
index_t BatchStrideB1_;
index_t BatchStrideC_;
};
using DeviceGemmGemmCommonBase =
DeviceGemmGemm_Wmma_CShuffleV3_Common<DeviceOp,
GemmSpec,
ALayout,
B0layout,
Tuple<>, // D0sLayout
B1Layout,
Tuple<>, // D1sLayout
CLayout,
BlockSize,
MPerBlock,
LPerBlock,
KPerBlock,
NPerBlock,
ADataType,
B0DataType,
B1DataType,
AccDataType,
CDataType,
Tuple<>, // D0sDataType
Tuple<>, // D1sDataType
AElementwiseOperation,
B0ElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CElementwiseOperation,
AK1,
BK1,
L1,
MPerWmma,
LPerWmma,
BlkGemmPipelineVer,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
B0BlockTransferSrcVectorDim,
B0BlockTransferSrcScalarPerVector,
B1BlockTransferSrcVectorDim,
B1BlockTransferSrcScalarPerVector,
ck::index_t{}, // CDE0BlockTransferSrcScalarPerVector
CShuffleBlockTransferScalarPerVector_NPerBlock,
false>; // IsMultiD
// GridwiseOp
using GridwiseOp = GridwiseBatchedGemmGemm_wmma_cshuffle_v3<
@@ -260,12 +163,12 @@ struct DeviceBatchedGemmGemm_Wmma_CShuffleV3 : public DeviceBatchedGemmGemm<ALay
CElementwiseOperation,
InMemoryDataOperationEnum::Set,
// InMemory Data Descriptor
AGridDesc,
B0GridDesc,
typename DeviceGemmGemmCommonBase::AGridDesc,
typename DeviceGemmGemmCommonBase::B0GridDesc,
Tuple<>, // Ds0GridDesc
B1GridDesc,
typename DeviceGemmGemmCommonBase::B1GridDesc,
Tuple<>, // Ds1GridDesc
CGridDesc_M_N,
typename DeviceGemmGemmCommonBase::CGridDesc_M_N,
// Tiling Family
MPerBlock,
LPerBlock,
@@ -312,339 +215,67 @@ struct DeviceBatchedGemmGemm_Wmma_CShuffleV3 : public DeviceBatchedGemmGemm<ALay
CShuffleNRepeatPerShuffle,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
CShuffleBlockTransferScalarPerVector_NPerBlock,
Transform::matrix_padder.PadN,
DeviceGemmGemmCommonBase::GridDescriptorCreator::Transform::matrix_padder.PadN,
BlkGemmPipeSched,
BlkGemmPipelineVer>;
struct RawArg : public BaseArgument
using DeviceGemmGemmCommon = DeviceGemmGemm_Wmma_CShuffleV3_Common_Invoker_Arg<
DeviceOp,
GemmSpec,
ALayout,
B0layout,
Tuple<>, // D0sLayout
B1Layout,
Tuple<>, // D1sLayout
CLayout,
BlockSize,
MPerBlock,
LPerBlock,
KPerBlock,
NPerBlock,
ADataType,
B0DataType,
B1DataType,
AccDataType,
CDataType,
Tuple<>, // D0sDataType,
Tuple<>, // D1sDataType,
AElementwiseOperation,
B0ElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CElementwiseOperation,
AK1,
BK1,
L1,
MPerWmma,
LPerWmma,
BlkGemmPipelineVer,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
B0BlockTransferSrcVectorDim,
B0BlockTransferSrcScalarPerVector,
B1BlockTransferSrcVectorDim,
B1BlockTransferSrcScalarPerVector,
ck::index_t{}, // CDE0BlockTransferSrcScalarPerVector
CShuffleBlockTransferScalarPerVector_NPerBlock,
false>; // IsMultiD
// Invoker
using Invoker = typename DeviceGemmGemmCommon::Invoker;
// Argument
using Argument = typename DeviceGemmGemmCommon::Argument;
static bool IsSupportedArgument(const Argument& arg)
{
using arr3 = std::array<ck::index_t, 3>;
RawArg(const ADataType* p_a_grid_,
const B0DataType* p_b0_grid_,
const B1DataType* p_b1_grid_,
CDataType* p_c_grid_,
index_t M_,
index_t N_,
index_t K_,
index_t O_,
index_t Batch,
index_t StrideA,
index_t StrideB0,
index_t StrideB1,
index_t StrideC,
index_t BatchStrideA,
index_t BatchStrideB0,
index_t BatchStrideB1,
index_t BatchStrideC,
AElementwiseOperation a_element_op_,
B0ElementwiseOperation b0_element_op_,
AccElementwiseOperation acc_element_op_,
B1ElementwiseOperation b1_element_op_,
CElementwiseOperation c_element_op_)
: p_a_grid{p_a_grid_},
p_b0_grid{p_b0_grid_},
p_b1_grid{p_b1_grid_},
p_c_grid{p_c_grid_},
M{M_},
N{N_},
K{K_},
O{O_},
batch_count{Batch},
a_element_op{a_element_op_},
b0_element_op{b0_element_op_},
acc_element_op{acc_element_op_},
b1_element_op{b1_element_op_},
c_element_op{c_element_op_},
compute_base_ptr_of_batch{BatchStrideA, BatchStrideB0, BatchStrideB1, BatchStrideC}
{
a_g_m_k_lengths = arr3{batch_count, M, K};
a_g_m_k_strides = arr3{BatchStrideA, StrideA, 1}; // A layout [batch_count, M, K]
b0_g_n_k_lengths = arr3{batch_count, N, K};
b0_g_n_k_strides = arr3{BatchStrideB0, StrideB0, 1}; // B0 layout [batch_count, N, K]
b1_g_o_n_lengths = arr3{batch_count, O, N};
b1_g_o_n_strides =
is_same_v<B1Layout, tensor_layout::gemm::RowMajor>
? arr3{BatchStrideB1, 1, StrideB1} // B1 layout [batch_count, N, O]
: arr3{BatchStrideB1, StrideB1, 1}; // B1 layout [batch_count, O, N]
c_g_m_o_lengths = arr3{batch_count, M, O};
c_g_m_o_strides = arr3{BatchStrideC, StrideC, 1}; // C layout [batch_count, M, O]
a_grid_desc = MakeAGridDescriptor(a_g_m_k_lengths, a_g_m_k_strides);
b0_grid_desc = MakeB0GridDescriptor(b0_g_n_k_lengths, b0_g_n_k_strides);
b1_grid_desc = MakeB1GridDescriptor(b1_g_o_n_lengths, b1_g_o_n_strides);
c_grid_desc_m_n = Transform::MakeCGridDescriptor_M_N(c_g_m_o_lengths, c_g_m_o_strides);
c_grid_desc_mblock_mperblock_nblock_nperblock =
GridwiseOp::MakeE1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n);
block_2_ctile_map = GridwiseOp::MakeDefaultBlock2ETileMap(c_grid_desc_m_n, 1, 1);
}
// Pointers
const ADataType* p_a_grid;
const B0DataType* p_b0_grid;
const B1DataType* p_b1_grid;
CDataType* p_c_grid;
// Raw Problem Size
index_t M;
index_t N;
index_t K;
index_t O;
index_t batch_count;
arr3 a_g_m_k_lengths;
arr3 a_g_m_k_strides;
arr3 b0_g_n_k_lengths;
arr3 b0_g_n_k_strides;
arr3 b1_g_o_n_lengths;
arr3 b1_g_o_n_strides;
arr3 c_g_m_o_lengths;
arr3 c_g_m_o_strides;
AElementwiseOperation a_element_op;
B0ElementwiseOperation b0_element_op;
AccElementwiseOperation acc_element_op;
B1ElementwiseOperation b1_element_op;
CElementwiseOperation c_element_op;
// Grid descriptors and other mem calculators
AGridDesc a_grid_desc;
B0GridDesc b0_grid_desc;
B1GridDesc b1_grid_desc;
CGridDesc_M_N c_grid_desc_m_n;
typename GridwiseOp::E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock;
typename GridwiseOp::DefaultBlock2ETileMap block_2_ctile_map;
ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch;
};
static bool IsSupportedArgument([[maybe_unused]] const RawArg& arg)
{
// Print lambda with env check and printf() style formmating.
const char* curFunc = __func__;
auto print = [&curFunc](const char* format, ...) -> void {
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wformat-nonliteral"
#endif
va_list args;
va_start(args, format);
std::vfprintf(stdout, format, args);
va_end(args);
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
std::cout << "In file: " << __FILE__ << ", function: " << curFunc << "\n";
}
};
if(!(ck::is_gfx11_supported() || ck::is_gfx12_supported()))
{
print("DeviceOp: Arch err\n");
return false;
}
if constexpr(std::is_same_v<ADataType, f8_t> || std::is_same_v<ADataType, bf8_t> ||
std::is_same_v<B0DataType, f8_t> || std::is_same_v<B0DataType, bf8_t> ||
std::is_same_v<B1DataType, f8_t> || std::is_same_v<B1DataType, bf8_t>)
{
if(ck::is_gfx11_supported())
{
print("DeviceOp: gfx 11 does not support fp8\n");
return false;
}
}
if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
{
print("DeviceOp: Acc0 Type err\n");
return false;
}
if constexpr(!(is_same_v<ALayout, tensor_layout::gemm::RowMajor>))
{
print("DeviceOp: A layout must be Row\n");
return false;
}
if constexpr(!(is_same_v<B0layout, tensor_layout::gemm::ColumnMajor>))
{
print("DeviceOp: B layout must be Column\n");
return false;
}
if constexpr(!(is_same_v<B1Layout, tensor_layout::gemm::RowMajor> ||
is_same_v<B1Layout, tensor_layout::gemm::ColumnMajor>))
{
print("DeviceOp: B1 layout must be Column or Row\n");
return false;
}
if constexpr(!(is_same_v<CLayout, tensor_layout::gemm::RowMajor>))
{
print("DeviceOp: C layout must be Row\n");
return false;
}
// Other padding modes have not been tested and do not get checked individually.
if constexpr(GemmSpec != GemmSpecialization::Default &&
GemmSpec != GemmSpecialization::MNKOPadding)
{
print("Padding mode must be default or MNKO\n");
return false;
}
// Per wmma dimensions not equal to 16 are very untested.
if constexpr(MPerWmma != 16 || LPerWmma != 16 || NPerWmma != 16)
{
print("M, L, N per Wmma must be 16\n");
return false;
}
if(!GridwiseOp::CheckValidity(arg.a_grid_desc,
arg.b0_grid_desc,
Tuple<>{},
arg.b1_grid_desc,
Tuple<>{},
arg.c_grid_desc_m_n,
arg.block_2_ctile_map))
{
return false;
}
// Check scalar per vector requirement
const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? arg.K : arg.M;
const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? arg.K : arg.N;
const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? arg.N : arg.O;
const auto c_extent_lowest = arg.O;
if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 &&
b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 &&
b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 &&
c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0))
{
print("DeviceOp: Data Transfer Vector scalar err\n");
return false;
}
// Check vector load/store requirement
const auto a_stride_lowest =
ABlockTransferSrcVectorDim == 2 ? arg.a_g_m_k_strides[2] : arg.a_g_m_k_strides[1];
const auto b0_stride_lowest =
B0BlockTransferSrcVectorDim == 2 ? arg.b0_g_n_k_strides[2] : arg.b0_g_n_k_strides[1];
const auto b1_stride_lowest =
B1BlockTransferSrcVectorDim == 2 ? arg.b1_g_o_n_strides[2] : arg.b1_g_o_n_strides[1];
const auto c_stride_lowest = arg.c_g_m_o_strides[2];
if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 ||
c_stride_lowest == 1))
{
print("DeviceOp: Data Vectorize transfer err\n");
return false;
}
if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MNKOPadding))
{
return false;
}
return true;
return DeviceGemmGemmCommon::IsSupportedArgument(arg);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const RawArg*>(p_arg));
return DeviceGemmGemmCommon::IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
struct Invoker : public BaseInvoker
{
using Argument = DeviceOp::RawArg;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto M0 = math::integer_divide_ceil(arg.M, MPerBlock);
const auto N0 = math::integer_divide_ceil(arg.O, NPerBlock);
const index_t grid_size = arg.batch_count * M0 * N0;
auto launch_kernel = [&](auto has_main_k_block_loop, auto tail_number) {
constexpr bool has_loop = decltype(has_main_k_block_loop)::value;
constexpr TailNumber tn = tail_number;
const auto kernel =
kernel_batched_gemm_gemm_wmma_cshuffle_v3<DeviceOp, GridwiseOp, has_loop, tn>;
return launch_and_time_kernel(
stream_config, kernel, dim3(grid_size), dim3(BlockSize), 0, arg);
};
bool HasMainKBlockLoop = GridwiseOp::CalculateHasMainKBlockLoop(arg.K);
TailNumber TailNum = GridwiseOp::CalculateKBlockLoopTailNum(arg.K);
if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
{
if(HasMainKBlockLoop && TailNum == TailNumber::Full)
{
return launch_kernel(std::integral_constant<bool, true>{},
std::integral_constant<TailNumber, TailNumber::Full>{});
}
else if(!HasMainKBlockLoop && TailNum == TailNumber::Full)
{
return launch_kernel(std::integral_constant<bool, false>{},
std::integral_constant<TailNumber, TailNumber::Full>{});
}
else
{
printf("Invalid HasMainKBlockLoop and TailNum combination for V1!\n");
return 0.0f;
}
}
else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
{
if(HasMainKBlockLoop && TailNum == TailNumber::Full)
{
return launch_kernel(std::integral_constant<bool, true>{},
std::integral_constant<TailNumber, TailNumber::Full>{});
}
else if(!HasMainKBlockLoop && TailNum == TailNumber::Even)
{
return launch_kernel(std::integral_constant<bool, false>{},
std::integral_constant<TailNumber, TailNumber::Even>{});
}
else if(!HasMainKBlockLoop && TailNum == TailNumber::Odd)
{
return launch_kernel(std::integral_constant<bool, false>{},
std::integral_constant<TailNumber, TailNumber::Odd>{});
}
else
{
printf("Invalid HasMainKBlockLoop and TailNum combination for V3!\n");
return 0.0f;
}
}
else
{
printf("Invalid pipeline version!\n");
return 0.0f;
}
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
// polymorphic
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b0,
@@ -669,28 +300,39 @@ struct DeviceBatchedGemmGemm_Wmma_CShuffleV3 : public DeviceBatchedGemmGemm<ALay
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op) override
{
return std::make_unique<RawArg>(static_cast<const ADataType*>(p_a),
static_cast<const B0DataType*>(p_b0),
static_cast<const B1DataType*>(p_b1),
static_cast<CDataType*>(p_c),
M,
N,
K,
O,
Batch,
StrideA,
StrideB0,
StrideB1,
StrideC,
BatchStrideA,
BatchStrideB0,
BatchStrideB1,
BatchStrideC,
a_element_op,
b0_element_op,
acc_element_op,
b1_element_op,
c_element_op);
std::array<const void*, DeviceGemmGemmCommonBase::NumD0Tensor> p_d0_grid{};
std::array<const void*, DeviceGemmGemmCommonBase::NumD1Tensor> p_d1_grid{};
std::array<index_t, DeviceGemmGemmCommonBase::NumD0Tensor> StrideD0s{}, BatchStrideD0s{};
std::array<index_t, DeviceGemmGemmCommonBase::NumD1Tensor> StrideD1s, BatchStrideD1s{};
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const B0DataType*>(p_b0),
p_d0_grid,
static_cast<const B1DataType*>(p_b1),
p_d1_grid,
static_cast<CDataType*>(p_c),
M,
N,
K,
O,
Batch,
StrideA,
StrideB0,
StrideD0s,
StrideB1,
StrideD1s,
StrideC,
BatchStrideA,
BatchStrideB0,
BatchStrideD0s,
BatchStrideB1,
BatchStrideD1s,
BatchStrideC,
a_element_op,
b0_element_op,
acc_element_op,
b1_element_op,
c_element_op);
}
static auto MakeInvoker() { return Invoker{}; }

View File

@@ -0,0 +1,902 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cstdint>
#include <iostream>
#include <cstdarg>
#include <type_traits>
#include <utility>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp"
#include "ck/utility/scheduler_enum.hpp"
#include "ck/utility/integral_constant.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename DeviceOp,
typename GridwiseOp,
bool HasMainKBlockLoop,
TailNumber TailNum,
bool IsMultiD>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_gemm_wmma_cshuffle_v3(typename DeviceOp::Argument arg)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
__shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()];
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / arg.batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset =
__builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetABasePtr(g_idx)));
const long_index_t b0_batch_offset =
__builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetB0BasePtr(g_idx)));
const long_index_t b1_batch_offset =
__builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
const long_index_t c_e1_batch_offset =
__builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetCE1BasePtr(g_idx)));
auto [p_d0s_grid, p_d1s_grid] = [&]() {
if constexpr(IsMultiD)
{
auto create_grid = [](auto NumTensor, auto func, auto& arg_grid, auto&& grid_pointer) {
static_for<0, decltype(NumTensor)::value, 1>{}([&](auto In) {
const long_index_t batch_offset = __builtin_amdgcn_readfirstlane(func(In));
grid_pointer(In) = arg_grid(In) + batch_offset;
});
return std::move(grid_pointer);
};
auto get_d0_base_ptr = [&arg, &g_idx](auto d_idx) {
return arg.compute_base_ptr_of_batch.GetD0BasePtr(g_idx, d_idx);
};
auto get_d1_base_ptr = [&arg, &g_idx](auto d_idx) {
return arg.compute_base_ptr_of_batch.GetD1BasePtr(g_idx, d_idx);
};
auto d0s_grid = create_grid(ck::integral_constant<ck::index_t, DeviceOp::NumD0Tensor>{},
get_d0_base_ptr,
arg.p_d0s_grid,
GridwiseOp::MakeD0sGridPointer());
auto d1s_grid = create_grid(ck::integral_constant<ck::index_t, DeviceOp::NumD1Tensor>{},
get_d1_base_ptr,
arg.p_d1s_grid,
GridwiseOp::MakeD1sGridPointer());
return std::make_pair(d0s_grid, d1s_grid);
}
else
{
return std::make_pair(Tuple<>{}, Tuple<>{});
}
}();
GridwiseOp::template Run<HasMainKBlockLoop, TailNum>(
arg.p_a_grid + a_batch_offset,
arg.p_b0_grid + b0_batch_offset,
p_d0s_grid,
arg.p_b1_grid + b1_batch_offset,
p_d1s_grid,
arg.p_c_e1_grid + c_e1_batch_offset,
p_shared,
arg.a_grid_desc,
arg.b0_grid_desc,
arg.d0s_grid_desc,
arg.b1_grid_desc,
arg.d1s_grid_desc_mblock_mperblock_nblock_nperblock,
arg.c_e1_grid_desc_mblock_mperblock_nblock_nperblock,
arg.a_element_op,
arg.b0_element_op,
arg.acc_element_op,
arg.b1_element_op,
arg.cde1_element_op,
arg.block_2_etile_map);
#else
ignore = arg;
#endif // (!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__)
}
template <typename DeviceOp,
GemmSpecialization GemmSpec,
typename ALayout,
typename B0layout,
typename D0sLayout,
typename B1Layout,
typename D1sLayout,
typename CE1Layout,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t LPerBlock, // Gemm0NPerBlock
ck::index_t KPerBlock, // Gemm0KPerBlock
ck::index_t NPerBlock, // Gemm1NPerBlock
typename ADataType,
typename B0DataType,
typename B1DataType,
typename AccDataType,
typename CE1DataType,
typename D0sDataType,
typename D1sDataType,
typename AElementwiseOperation,
typename B0ElementwiseOperation,
typename AccElementwiseOperation,
typename B1ElementwiseOperation,
typename CDE1ElementwiseOperation,
ck::index_t AK1,
ck::index_t BK1,
ck::index_t L1, // B1K1
ck::index_t MPerWmma, // Gemm0/1 MPerWmma
ck::index_t LPerWmma, // Gemm0/1 NPerWmma
BlockGemmPipelineVersion BlkGemmPipelineVer,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t B0BlockTransferSrcVectorDim,
ck::index_t B0BlockTransferSrcScalarPerVector,
ck::index_t B1BlockTransferSrcVectorDim,
ck::index_t B1BlockTransferSrcScalarPerVector,
ck::index_t CDE0BlockTransferSrcScalarPerVector,
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
bool IsMultiD = false>
struct DeviceGemmGemm_Wmma_CShuffleV3_Common
{
static constexpr ck::index_t NumD0Tensor = []() {
if constexpr(IsMultiD)
{
return DeviceOp::NumD0Tensor;
}
return 0;
}();
static constexpr ck::index_t NumD1Tensor = []() {
if constexpr(IsMultiD)
{
return DeviceOp::NumD1Tensor;
}
return 0;
}();
struct GridDescriptorCreator
{
// TODO: Now that we are no longer using NumDim or TensorSpec, we can probably use a simpler
// Transform operator or just not use one at all.
using Transform = TransformBatchedContractionContractionToBatchedGemmGemm_Wmma<
Sequence<1, 1, 1, 1, 1>,
Sequence<MPerBlock, LPerBlock, KPerBlock, NPerBlock>,
GemmSpec,
TensorSpecialization::Default, // ASpec
TensorSpecialization::Default, // B0Spec
TensorSpecialization::Default, // B1Spec
TensorSpecialization::Default>; // CSpec
__host__ __device__ static auto
MakeAGridDescriptor(const std::array<index_t, 3>& a_g_m_k_lengths_vec,
const std::array<index_t, 3>& a_g_m_k_strides_vec)
{
return Transform::MakeAGridDescriptor_AK0_M_AK1(
Transform::MakeAGridDescriptor_M_K(a_g_m_k_lengths_vec, a_g_m_k_strides_vec),
Number<AK1>{});
}
__host__ __device__ static auto
MakeB0GridDescriptor(const std::array<index_t, 3>& b0_g_l_k_lengths_vec,
const std::array<index_t, 3>& b0_g_l_k_strides_vec)
{
return Transform::MakeB0GridDescriptor_BK0_N_BK1(
Transform::MakeB0GridDescriptor_N_K(b0_g_l_k_lengths_vec, b0_g_l_k_strides_vec),
Number<BK1>{});
}
__host__ __device__ static auto
MakeB1GridDescriptor(const std::array<index_t, 3>& b1_g_n_l_lengths_vec,
const std::array<index_t, 3>& b1_g_n_l_strides_vec)
{
return Transform::MakeB1GridDescriptor_BK0_N_BK1(
Transform::MakeB1GridDescriptor_N_K(b1_g_n_l_lengths_vec, b1_g_n_l_strides_vec),
Number<L1>{});
}
__host__ __device__ static auto
MakeD0GridDescriptor(const std::array<index_t, 3>& d0_g_m_n_lengths_vec,
const std::array<index_t, 3>& d0_g_m_n_strides_vec)
{
return Transform::MakeCGridDescriptor_M_N(d0_g_m_n_lengths_vec, d0_g_m_n_strides_vec);
}
__host__ __device__ static auto MakeD0sGridDescriptor(
const std::array<std::array<index_t, 3>, NumD0Tensor>& d0_g_m_n_lengths_vec,
const std::array<std::array<index_t, 3>, NumD0Tensor>& d0_g_m_n_strides_vec)
{
return generate_tuple(
[&](auto i) {
return MakeD0GridDescriptor(d0_g_m_n_lengths_vec[i], d0_g_m_n_strides_vec[i]);
},
Number<NumD0Tensor>{});
}
__host__ __device__ static auto MakeD1sGridDescriptor(
const std::array<std::array<index_t, 3>, NumD1Tensor>& d1_g_m_o_lengths_vec,
const std::array<std::array<index_t, 3>, NumD1Tensor>& d1_g_m_o_strides_vec)
{
return generate_tuple(
[&](auto i) {
return MakeE1GridDescriptor(d1_g_m_o_lengths_vec[i], d1_g_m_o_strides_vec[i]);
},
Number<NumD1Tensor>{});
}
__host__ __device__ static auto
MakeE1GridDescriptor(const std::array<index_t, 3>& e1_g_m_n_lengths_vec,
const std::array<index_t, 3>& e1_g_m_n_strides_vec)
{
return Transform::MakeCGridDescriptor_M_N(e1_g_m_n_lengths_vec, e1_g_m_n_strides_vec);
}
};
using AGridDesc = decltype(GridDescriptorCreator::MakeAGridDescriptor({}, {}));
using B0GridDesc = decltype(GridDescriptorCreator::MakeB0GridDescriptor({}, {}));
using D0sGridDesc =
remove_cvref_t<decltype(GridDescriptorCreator::MakeD0sGridDescriptor({}, {}))>;
using B1GridDesc = decltype(GridDescriptorCreator::MakeB1GridDescriptor({}, {}));
using D1sGridDesc =
remove_cvref_t<decltype(GridDescriptorCreator::MakeD1sGridDescriptor({}, {}))>;
using E1GridDesc = decltype(GridDescriptorCreator::MakeE1GridDescriptor({}, {}));
using CGridDesc_M_N =
decltype(GridDescriptorCreator::Transform::MakeCGridDescriptor_M_N({}, {}));
struct ComputeBasePtrOfStridedBatch
{
ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB0,
index_t BatchStrideB1,
index_t BatchStrideC)
: BatchStrideA_(BatchStrideA),
BatchStrideB0_(BatchStrideB0),
BatchStrideB1_(BatchStrideB1),
BatchStrideC_E1_(BatchStrideC)
{
}
ComputeBasePtrOfStridedBatch(index_t BatchStrideA0,
index_t BatchStrideB0,
std::array<index_t, NumD0Tensor> BatchStrideD0s,
index_t BatchStrideB1,
std::array<index_t, NumD1Tensor> BatchStrideD1s,
index_t BatchStrideE1)
: BatchStrideA_(BatchStrideA0),
BatchStrideB0_(BatchStrideB0),
BatchStrideD0s_(BatchStrideD0s),
BatchStrideB1_(BatchStrideB1),
BatchStrideD1s_(BatchStrideD1s),
BatchStrideC_E1_(BatchStrideE1)
{
}
__host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetB0BasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB0_);
}
__host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB1_);
}
__host__ __device__ constexpr long_index_t GetCE1BasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideC_E1_);
}
template <index_t I>
__host__ __device__ constexpr long_index_t GetD0BasePtr(index_t g_idx,
Number<I> d0_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideD0s_[d0_idx]);
}
template <index_t I>
__host__ __device__ constexpr long_index_t GetD1BasePtr(index_t g_idx,
Number<I> d1_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideD1s_[d1_idx]);
}
private:
index_t BatchStrideA_;
index_t BatchStrideB0_;
std::array<index_t, NumD0Tensor> BatchStrideD0s_;
index_t BatchStrideB1_;
std::array<index_t, NumD1Tensor> BatchStrideD1s_;
index_t BatchStrideC_E1_;
};
};
template <typename DeviceOp,
GemmSpecialization GemmSpec,
typename ALayout,
typename B0layout,
typename D0sLayout,
typename B1Layout,
typename D1sLayout,
typename CE1Layout,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t LPerBlock, // Gemm0NPerBlock
ck::index_t KPerBlock, // Gemm0KPerBlock
ck::index_t NPerBlock, // Gemm1NPerBlock
typename ADataType,
typename B0DataType,
typename B1DataType,
typename AccDataType,
typename CE1DataType,
typename D0sDataType,
typename D1sDataType,
typename AElementwiseOperation,
typename B0ElementwiseOperation,
typename AccElementwiseOperation,
typename B1ElementwiseOperation,
typename CDE1ElementwiseOperation,
ck::index_t AK1,
ck::index_t BK1,
ck::index_t L1, // B1K1
ck::index_t MPerWmma, // Gemm0/1 MPerWmma
ck::index_t LPerWmma, // Gemm0/1 NPerWmma
BlockGemmPipelineVersion BlkGemmPipelineVer,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t B0BlockTransferSrcVectorDim,
ck::index_t B0BlockTransferSrcScalarPerVector,
ck::index_t B1BlockTransferSrcVectorDim,
ck::index_t B1BlockTransferSrcScalarPerVector,
ck::index_t CDE0BlockTransferSrcScalarPerVector,
ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
bool IsMultiD = false>
struct DeviceGemmGemm_Wmma_CShuffleV3_Common_Invoker_Arg
{
using GridwiseGemm = typename DeviceOp::GridwiseOp;
using Common =
DeviceGemmGemm_Wmma_CShuffleV3_Common<DeviceOp,
GemmSpec,
ALayout,
B0layout,
D0sLayout,
B1Layout,
D1sLayout,
CE1Layout,
BlockSize,
MPerBlock,
LPerBlock,
KPerBlock,
NPerBlock,
ADataType,
B0DataType,
B1DataType,
AccDataType,
CE1DataType,
D0sDataType,
D1sDataType,
AElementwiseOperation,
B0ElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CDE1ElementwiseOperation,
AK1,
BK1,
L1,
MPerWmma,
LPerWmma,
BlkGemmPipelineVer,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
B0BlockTransferSrcVectorDim,
B0BlockTransferSrcScalarPerVector,
B1BlockTransferSrcVectorDim,
B1BlockTransferSrcScalarPerVector,
CDE0BlockTransferSrcScalarPerVector,
CShuffleBlockTransferScalarPerVector_NPerBlock,
IsMultiD>;
static constexpr auto NumD0Tensor = Common::NumD0Tensor;
static constexpr auto NumD1Tensor = Common::NumD1Tensor;
struct Argument : public BaseArgument
{
using arr3 = std::array<ck::index_t, 3>;
Argument(const ADataType* p_a_grid_,
const B0DataType* p_b0_grid_,
std::array<const void*, NumD0Tensor> p_d0s_grid_,
const B1DataType* p_b1_grid_,
std::array<const void*, NumD1Tensor> p_d1s_grid_,
CE1DataType* p_e1_grid_,
index_t M_,
index_t N_,
index_t K_,
index_t O_,
index_t Batch,
index_t StrideA,
index_t StrideB0,
std::array<index_t, NumD0Tensor> StrideD0s,
index_t StrideB1,
std::array<index_t, NumD1Tensor> StrideD1s,
index_t StrideE1,
index_t BatchStrideA,
index_t BatchStrideB0,
std::array<index_t, NumD0Tensor> BatchStrideD0s,
index_t BatchStrideB1,
std::array<index_t, NumD1Tensor> BatchStrideD1s,
index_t BatchStrideE1,
AElementwiseOperation a_element_op_,
B0ElementwiseOperation b0_element_op_,
AccElementwiseOperation acc_element_op_,
B1ElementwiseOperation b1_element_op_,
CDE1ElementwiseOperation cde1_element_op_)
: p_a_grid{p_a_grid_},
p_b0_grid{p_b0_grid_},
p_d0s_grid{},
p_b1_grid{p_b1_grid_},
p_d1s_grid{},
p_c_e1_grid{p_e1_grid_},
M{M_},
N{N_},
K{K_},
O{O_},
batch_count{Batch},
a_element_op{a_element_op_},
b0_element_op{b0_element_op_},
acc_element_op{acc_element_op_},
b1_element_op{b1_element_op_},
cde1_element_op{cde1_element_op_},
compute_base_ptr_of_batch{BatchStrideA,
BatchStrideB0,
BatchStrideD0s,
BatchStrideB1,
BatchStrideD1s,
BatchStrideE1}
{
a_g_m_k_lengths = arr3{batch_count, M, K};
a_g_m_k_strides = arr3{BatchStrideA, StrideA, 1}; // A layout [batch_count, M, K]
b0_g_n_k_lengths = arr3{batch_count, N, K};
b0_g_n_k_strides = arr3{BatchStrideB0, StrideB0, 1}; // B0 layout [batch_count, N, K]
b1_g_o_n_lengths = arr3{batch_count, O, N};
b1_g_o_n_strides =
is_same_v<B1Layout, tensor_layout::gemm::RowMajor>
? arr3{BatchStrideB1, 1, StrideB1} // B1 layout [batch_count, N, O]
: arr3{BatchStrideB1, StrideB1, 1}; // B1 layout [batch_count, O, N]
e1_g_m_o_lengths = arr3{batch_count, M, O};
e1_g_m_o_strides = arr3{BatchStrideE1, StrideE1, 1}; // C layout [batch_count, M, O]
a_grid_desc = Common::GridDescriptorCreator::MakeAGridDescriptor(a_g_m_k_lengths,
a_g_m_k_strides);
b0_grid_desc = Common::GridDescriptorCreator::MakeB0GridDescriptor(b0_g_n_k_lengths,
b0_g_n_k_strides);
b1_grid_desc = Common::GridDescriptorCreator::MakeB1GridDescriptor(b1_g_o_n_lengths,
b1_g_o_n_strides);
c_e1_grid_desc_m_n = Common::GridDescriptorCreator::MakeE1GridDescriptor(
e1_g_m_o_lengths, e1_g_m_o_strides);
c_e1_grid_desc_mblock_mperblock_nblock_nperblock =
GridwiseGemm::MakeE1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
c_e1_grid_desc_m_n);
block_2_etile_map = GridwiseGemm::MakeDefaultBlock2ETileMap(c_e1_grid_desc_m_n, 1, 1);
if constexpr(IsMultiD)
{
static_for<0, NumD0Tensor, 1>{}([&](auto i) {
using D0DataType = remove_cvref_t<tuple_element_t<i.value, D0sDataType>>;
// D0s layout [batch_count, M, N]
d0s_g_m_n_lengths[i] = arr3{batch_count, M, N};
d0s_g_m_n_strides[i] = arr3{BatchStrideD0s[i], StrideD0s[i], 1};
// D0 pointer
p_d0s_grid(i) = static_cast<const D0DataType*>(p_d0s_grid_[i]);
});
// D0 desc
d0s_grid_desc = Common::GridDescriptorCreator::MakeD0sGridDescriptor(
d0s_g_m_n_lengths, d0s_g_m_n_strides);
static_for<0, NumD1Tensor, 1>{}([&](auto i) {
using D1DataType = remove_cvref_t<tuple_element_t<i.value, D1sDataType>>;
// D1s layout [batch_count, M, O]
d1s_g_m_o_lengths[i] = arr3{batch_count, M, O};
d1s_g_m_o_strides[i] = arr3{BatchStrideD1s[i], StrideD1s[i], 1};
// D1 pointer
p_d1s_grid(i) = static_cast<const D1DataType*>(p_d1s_grid_[i]);
});
// D1 desc
d1s_grid_desc = Common::GridDescriptorCreator::MakeD1sGridDescriptor(
d1s_g_m_o_lengths, d1s_g_m_o_strides);
d1s_grid_desc_mblock_mperblock_nblock_nperblock =
GridwiseGemm::MakeD1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
d1s_grid_desc);
}
}
// Pointers
const ADataType* p_a_grid;
const B0DataType* p_b0_grid;
typename GridwiseGemm::D0sGridPointer p_d0s_grid;
const B1DataType* p_b1_grid;
typename GridwiseGemm::D1sGridPointer p_d1s_grid;
CE1DataType* p_c_e1_grid;
// Raw Problem Size
index_t M;
index_t N;
index_t K;
index_t O;
index_t batch_count;
arr3 a_g_m_k_lengths;
arr3 a_g_m_k_strides;
arr3 b0_g_n_k_lengths;
arr3 b0_g_n_k_strides;
std::array<arr3, NumD0Tensor> d0s_g_m_n_lengths;
std::array<arr3, NumD0Tensor> d0s_g_m_n_strides;
arr3 b1_g_o_n_lengths;
arr3 b1_g_o_n_strides;
std::array<arr3, NumD1Tensor> d1s_g_m_o_lengths;
std::array<arr3, NumD1Tensor> d1s_g_m_o_strides;
arr3 e1_g_m_o_lengths;
arr3 e1_g_m_o_strides;
AElementwiseOperation a_element_op;
B0ElementwiseOperation b0_element_op;
AccElementwiseOperation acc_element_op;
B1ElementwiseOperation b1_element_op;
CDE1ElementwiseOperation cde1_element_op;
// Grid descriptors and other mem calculators
typename Common::AGridDesc a_grid_desc;
typename Common::B0GridDesc b0_grid_desc;
std::conditional_t<IsMultiD, typename Common::D0sGridDesc, Tuple<>> d0s_grid_desc;
typename Common::B1GridDesc b1_grid_desc;
typename Common::D1sGridDesc d1s_grid_desc;
std::conditional_t<
IsMultiD,
typename GridwiseGemm::D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
Tuple<>>
d1s_grid_desc_mblock_mperblock_nblock_nperblock;
std::conditional_t<IsMultiD, typename Common::E1GridDesc, typename Common::CGridDesc_M_N>
c_e1_grid_desc_m_n;
typename GridwiseGemm::E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
c_e1_grid_desc_mblock_mperblock_nblock_nperblock;
typename GridwiseGemm::DefaultBlock2ETileMap block_2_etile_map;
typename Common::ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch;
};
/// @brief Helper structure responsible for kernel invocation.
///
/// @paragraph The `Invoker` class is responsible for preparation and invocation of actual GPU
/// kernel function. It usually determines the launched grid size prepares kernel
/// arguments as well as perform specific kernel configuration selection based on
/// runtime arguments.
///
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto M0 = math::integer_divide_ceil(arg.M, MPerBlock);
const auto N0 = math::integer_divide_ceil(arg.O, NPerBlock);
const index_t grid_size = arg.batch_count * M0 * N0;
auto launch_kernel = [&](auto has_main_k_block_loop, auto tail_number) {
constexpr bool has_loop = decltype(has_main_k_block_loop)::value;
constexpr TailNumber tail_num = decltype(tail_number)::value;
const auto kernel = kernel_batched_gemm_gemm_wmma_cshuffle_v3<DeviceOp,
GridwiseGemm,
has_loop,
tail_num,
IsMultiD>;
return launch_and_time_kernel(
stream_config, kernel, dim3(grid_size), dim3(BlockSize), 0, arg);
};
bool HasMainKBlockLoop = GridwiseGemm::CalculateHasMainKBlockLoop(arg.K);
TailNumber TailNum = GridwiseGemm::CalculateKBlockLoopTailNum(arg.K);
if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
{
if(HasMainKBlockLoop && TailNum == TailNumber::Full)
{
return launch_kernel(std::integral_constant<bool, true>{},
std::integral_constant<TailNumber, TailNumber::Full>{});
}
else if(!HasMainKBlockLoop && TailNum == TailNumber::Full)
{
return launch_kernel(std::integral_constant<bool, false>{},
std::integral_constant<TailNumber, TailNumber::Full>{});
}
else
{
printf("Invalid HasMainKBlockLoop and TailNum combination for V1!\n");
return 0.0f;
}
}
else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
{
if(HasMainKBlockLoop && TailNum == TailNumber::Full)
{
return launch_kernel(std::integral_constant<bool, true>{},
std::integral_constant<TailNumber, TailNumber::Full>{});
}
else if(!HasMainKBlockLoop && TailNum == TailNumber::Even)
{
return launch_kernel(std::integral_constant<bool, false>{},
std::integral_constant<TailNumber, TailNumber::Even>{});
}
else if(!HasMainKBlockLoop && TailNum == TailNumber::Odd)
{
return launch_kernel(std::integral_constant<bool, false>{},
std::integral_constant<TailNumber, TailNumber::Odd>{});
}
else
{
printf("Invalid HasMainKBlockLoop and TailNum combination for V3!\n");
return 0.0f;
}
}
else
{
printf("Invalid pipeline version!\n");
return 0.0f;
}
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
// check if DsLayout is supported
template <typename RefLayout, typename DsLayout, const index_t NumDTensor>
static constexpr bool CheckDLayout()
{
bool valid = true;
// iterate over DLayout tuple
static_for<0, NumDTensor, 1>{}([&](auto i) {
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
// if RefLayout and DLayout are same, keep valid true, otherwise false
valid = valid && is_same_v<RefLayout, DLayout>;
});
return valid;
}
static bool IsSupportedArgument(const Argument& arg)
{
// Print lambda with env check and printf() style formmating.
const char* curFunc = __func__;
auto print = [&curFunc](const char* format, ...) -> void {
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wformat-nonliteral"
#endif
va_list args;
va_start(args, format);
std::vfprintf(stdout, format, args);
va_end(args);
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
std::cout << "In file: " << __FILE__ << ", function: " << curFunc << "\n";
}
};
if(!(ck::is_gfx11_supported() || ck::is_gfx12_supported()))
{
print("DeviceOp: Arch err\n");
return false;
}
if constexpr(std::is_same_v<ADataType, f8_t> || std::is_same_v<ADataType, bf8_t> ||
std::is_same_v<B0DataType, f8_t> || std::is_same_v<B0DataType, bf8_t> ||
std::is_same_v<B1DataType, f8_t> || std::is_same_v<B1DataType, bf8_t>)
{
if(ck::is_gfx11_supported())
{
print("DeviceOp: gfx 11 does not support fp8\n");
return false;
}
}
if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
{
print("DeviceOp: Acc0 Type err\n");
return false;
}
if constexpr(!(is_same_v<ALayout, tensor_layout::gemm::RowMajor>))
{
print("DeviceOp: A layout must be Row\n");
return false;
}
if constexpr(!(is_same_v<B1Layout, tensor_layout::gemm::RowMajor> ||
is_same_v<B1Layout, tensor_layout::gemm::ColumnMajor>))
{
print("DeviceOp: B1 layout must be Column or Row\n");
return false;
}
if constexpr(!(is_same_v<CE1Layout, tensor_layout::gemm::RowMajor>))
{
print("DeviceOp: C layout must be Row\n");
return false;
}
// Other padding modes have not been tested and do not get checked individually.
if constexpr(GemmSpec != GemmSpecialization::Default &&
GemmSpec != GemmSpecialization::MNKOPadding)
{
print("Padding mode must be default or MNKO\n");
return false;
}
// Per wmma dimensions not equal to 16 are very untested.
if constexpr(MPerWmma != 16 || LPerWmma != 16 || DeviceOp::NPerWmma != 16)
{
print("M, L, N per Wmma must be 16\n");
return false;
}
if constexpr(IsMultiD)
{
if constexpr(!(is_same_v<B0layout, tensor_layout::gemm::ColumnMajor>))
{
print("DeviceOp: B0 layout must be Column\n");
return false;
}
if constexpr(!(CheckDLayout<tensor_layout::gemm::RowMajor, D0sLayout, NumD0Tensor>()))
{
print("DeviceOp: All D0s layout must be Row\n");
return false;
}
if constexpr(!(CheckDLayout<tensor_layout::gemm::RowMajor, D1sLayout, NumD1Tensor>()))
{
print("DeviceOp: All D1s layout must be Row\n");
return false;
}
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc,
arg.b0_grid_desc,
arg.d0s_grid_desc,
arg.b1_grid_desc,
arg.d1s_grid_desc,
arg.c_e1_grid_desc_m_n,
arg.block_2_etile_map))
{
return false;
}
// Check scalar per vector requirement
const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? arg.K : arg.M;
const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? arg.K : arg.N;
const auto cde0_extent_lowest = arg.N; // D0 tensors forced to be row-major
const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? arg.N : arg.O;
const auto cde1_extent_lowest = arg.O;
if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 &&
b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 &&
cde0_extent_lowest % CDE0BlockTransferSrcScalarPerVector == 0 &&
b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 &&
cde1_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0))
{
print("DeviceOp: Data Transfer Vector scalar err\n");
return false;
}
// Check vector load/store requirement
const auto a_stride_lowest =
ABlockTransferSrcVectorDim == 2 ? arg.a_g_m_k_strides[2] : arg.a_g_m_k_strides[1];
const auto b0_stride_lowest = B0BlockTransferSrcVectorDim == 2
? arg.b0_g_n_k_strides[2]
: arg.b0_g_n_k_strides[1];
const auto b1_stride_lowest = B1BlockTransferSrcVectorDim == 2
? arg.b1_g_o_n_strides[2]
: arg.b1_g_o_n_strides[1];
const auto e1_stride_lowest = arg.e1_g_m_o_strides[2];
// NOTE: We don't check D0s/D1s stride, as they are already forced to be row-major
// and the lowest dimension stride is hardcoded to 1
if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 ||
e1_stride_lowest == 1))
{
print("DeviceOp: Data Vectorize transfer err\n");
return false;
}
}
else
{
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc,
arg.b0_grid_desc,
Tuple<>{},
arg.b1_grid_desc,
Tuple<>{},
arg.c_e1_grid_desc_m_n,
arg.block_2_etile_map))
{
return false;
}
// Check scalar per vector requirement
const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? arg.K : arg.M;
const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? arg.K : arg.N;
const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? arg.N : arg.O;
const auto c_extent_lowest = arg.O;
if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 &&
b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 &&
b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 &&
c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0))
{
print("DeviceOp: Data Transfer Vector scalar err\n");
return false;
}
// Check vector load/store requirement
const auto a_stride_lowest =
ABlockTransferSrcVectorDim == 2 ? arg.a_g_m_k_strides[2] : arg.a_g_m_k_strides[1];
const auto b0_stride_lowest = B0BlockTransferSrcVectorDim == 2
? arg.b0_g_n_k_strides[2]
: arg.b0_g_n_k_strides[1];
const auto b1_stride_lowest = B1BlockTransferSrcVectorDim == 2
? arg.b1_g_o_n_strides[2]
: arg.b1_g_o_n_strides[1];
const auto c_stride_lowest = arg.e1_g_m_o_strides[2];
if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 ||
c_stride_lowest == 1))
{
print("DeviceOp: Data Vectorize transfer err\n");
return false;
}
}
if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MNKOPadding))
{
return false;
}
return true;
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -39,7 +39,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_gemm_xdl_cshuffle_v1(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,

View File

@@ -63,7 +63,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_xdl(const ABDataType* __restrict__ p_a_grid,
const ABDataType* __restrict__ p_b_grid,
@@ -99,7 +99,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
DsPointer p_ds_grid_grp;

View File

@@ -3,91 +3,20 @@
#pragma once
#include <iostream>
#include <sstream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3_common.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_wmma_cshuffle_v3.hpp"
#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename DeviceOp, typename GridwiseOp, bool HasMainKBlockLoop, TailNumber TailNum>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_multiple_d_gemm_multiple_d_wmma_cshuffle_v3(typename DeviceOp::RawArg arg)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
__shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()];
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / arg.batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset =
__builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetABasePtr(g_idx)));
const long_index_t b0_batch_offset =
__builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetB0BasePtr(g_idx)));
const long_index_t b1_batch_offset =
__builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
const long_index_t e1_batch_offset =
__builtin_amdgcn_readfirstlane((arg.compute_base_ptr_of_batch.GetE1BasePtr(g_idx)));
auto p_d0s_grid = GridwiseOp::MakeD0sGridPointer();
auto p_d1s_grid = GridwiseOp::MakeD1sGridPointer();
static_for<0, DeviceOp::NumD0Tensor, 1>{}([&](auto In) {
const long_index_t d0_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(arg.compute_base_ptr_of_batch.GetD0BasePtr(g_idx, In)));
p_d0s_grid(In) = arg.p_d0s_grid(In) + d0_batch_offset;
});
static_for<0, DeviceOp::NumD1Tensor, 1>{}([&](auto In) {
const long_index_t d1_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(arg.compute_base_ptr_of_batch.GetD1BasePtr(g_idx, In)));
p_d1s_grid(In) = arg.p_d1s_grid(In) + d1_batch_offset;
});
GridwiseOp::template Run<HasMainKBlockLoop, TailNum>(
arg.p_a_grid + a_batch_offset,
arg.p_b0_grid + b0_batch_offset,
p_d0s_grid,
arg.p_b1_grid + b1_batch_offset,
p_d1s_grid,
arg.p_e1_grid + e1_batch_offset,
p_shared,
arg.a_grid_desc,
arg.b0_grid_desc,
arg.d0s_grid_desc,
arg.b1_grid_desc,
arg.d1s_grid_desc_mblock_mperblock_nblock_nperblock,
arg.e1_grid_desc_mblock_mperblock_nblock_nperblock,
arg.a_element_op,
arg.b0_element_op,
arg.acc_element_op,
arg.b1_element_op,
arg.cde1_element_op,
arg.block_2_etile_map);
#else
ignore = arg;
#endif // (!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__)
}
// Computes:
// Acc = Acc_Op(A_Op(A) * B0_Op(B0), D0_0, D0_1, ...)
// E = CDE1_Op(Acc_Op(Acc0) * B1_Op(B1), D1_0, D1_1, ...)
@@ -184,151 +113,51 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Wmma_CShuffleV3
static constexpr index_t NumD0Tensor = D0sDataType::Size();
static constexpr index_t NumD1Tensor = D1sDataType::Size();
static constexpr auto I0 = Number<0>{};
// To match XDL implementation NPerWmma (A.k.a Gemm1 NPerWmma) is set equal
// to LPerWmma (A.k.a Gemm0 NPerWmma).
static constexpr index_t NPerWmma = LPerWmma;
// TODO: Now that we are no longer using NumDim or TensorSpec, we can probably use a simpler
// Transform operator or just not use one at all.
using Transform = TransformBatchedContractionContractionToBatchedGemmGemm_Wmma<
Sequence<1, 1, 1, 1, 1>,
Sequence<MPerBlock, LPerBlock, KPerBlock, NPerBlock>,
GemmSpec,
TensorSpecialization::Default, // ASpec
TensorSpecialization::Default, // B0Spec
TensorSpecialization::Default, // B1Spec
TensorSpecialization::Default>; // CSpec
__host__ __device__ static auto
MakeAGridDescriptor(const std::array<index_t, 3>& a_g_m_k_lengths_vec,
const std::array<index_t, 3>& a_g_m_k_strides_vec)
{
return Transform::MakeAGridDescriptor_AK0_M_AK1(
Transform::MakeAGridDescriptor_M_K(a_g_m_k_lengths_vec, a_g_m_k_strides_vec),
Number<AK1>{});
}
__host__ __device__ static auto
MakeB0GridDescriptor(const std::array<index_t, 3>& b0_g_l_k_lengths_vec,
const std::array<index_t, 3>& b0_g_l_k_strides_vec)
{
return Transform::MakeB0GridDescriptor_BK0_N_BK1(
Transform::MakeB0GridDescriptor_N_K(b0_g_l_k_lengths_vec, b0_g_l_k_strides_vec),
Number<BK1>{});
}
__host__ __device__ static auto
MakeB1GridDescriptor(const std::array<index_t, 3>& b1_g_n_l_lengths_vec,
const std::array<index_t, 3>& b1_g_n_l_strides_vec)
{
return Transform::MakeB1GridDescriptor_BK0_N_BK1(
Transform::MakeB1GridDescriptor_N_K(b1_g_n_l_lengths_vec, b1_g_n_l_strides_vec),
Number<L1>{});
}
__host__ __device__ static auto
MakeD0GridDescriptor(const std::array<index_t, 3>& d0_g_m_n_lengths_vec,
const std::array<index_t, 3>& d0_g_m_n_strides_vec)
{
return Transform::MakeCGridDescriptor_M_N(d0_g_m_n_lengths_vec, d0_g_m_n_strides_vec);
}
__host__ __device__ static auto MakeD0sGridDescriptor(
const std::array<std::array<index_t, 3>, NumD0Tensor>& d0_g_m_n_lengths_vec,
const std::array<std::array<index_t, 3>, NumD0Tensor>& d0_g_m_n_strides_vec)
{
return generate_tuple(
[&](auto i) {
return MakeD0GridDescriptor(d0_g_m_n_lengths_vec[i], d0_g_m_n_strides_vec[i]);
},
Number<NumD0Tensor>{});
}
__host__ __device__ static auto MakeD1sGridDescriptor(
const std::array<std::array<index_t, 3>, NumD0Tensor>& d1_g_m_o_lengths_vec,
const std::array<std::array<index_t, 3>, NumD0Tensor>& d1_g_m_o_strides_vec)
{
return generate_tuple(
[&](auto i) {
return MakeE1GridDescriptor(d1_g_m_o_lengths_vec[i], d1_g_m_o_strides_vec[i]);
},
Number<NumD1Tensor>{});
}
__host__ __device__ static auto
MakeE1GridDescriptor(const std::array<index_t, 3>& e1_g_m_n_lengths_vec,
const std::array<index_t, 3>& e1_g_m_n_strides_vec)
{
return Transform::MakeCGridDescriptor_M_N(e1_g_m_n_lengths_vec, e1_g_m_n_strides_vec);
}
using AGridDesc = decltype(MakeAGridDescriptor({}, {}));
using B0GridDesc = decltype(MakeB0GridDescriptor({}, {}));
using D0sGridDesc = remove_cvref_t<decltype(MakeD0sGridDescriptor({}, {}))>;
using B1GridDesc = decltype(MakeB1GridDescriptor({}, {}));
using D1sGridDesc = remove_cvref_t<decltype(MakeD1sGridDescriptor({}, {}))>;
using E1GridDesc = decltype(MakeE1GridDescriptor({}, {}));
struct ComputeBasePtrOfStridedBatch
{
ComputeBasePtrOfStridedBatch(index_t BatchStrideA0,
index_t BatchStrideB0,
std::array<index_t, NumD0Tensor> BatchStrideD0s,
index_t BatchStrideB1,
std::array<index_t, NumD1Tensor> BatchStrideD1s,
index_t BatchStrideE1)
: BatchStrideA0_(BatchStrideA0),
BatchStrideB0_(BatchStrideB0),
BatchStrideD0s_(BatchStrideD0s),
BatchStrideB1_(BatchStrideB1),
BatchStrideD1s_(BatchStrideD1s),
BatchStrideE1_(BatchStrideE1)
{
}
__host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA0_);
}
__host__ __device__ constexpr long_index_t GetB0BasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB0_);
}
template <index_t I>
__host__ __device__ constexpr long_index_t GetD0BasePtr(index_t g_idx,
Number<I> d1_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideD0s_[d1_idx]);
}
__host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB1_);
}
__host__ __device__ constexpr long_index_t GetE1BasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideE1_);
}
template <index_t I>
__host__ __device__ constexpr auto GetD1BasePtr(index_t g_idx, Number<I> d1_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideD1s_[d1_idx]);
}
private:
index_t BatchStrideA0_;
index_t BatchStrideB0_;
std::array<index_t, NumD0Tensor> BatchStrideD0s_;
index_t BatchStrideB1_;
std::array<index_t, NumD1Tensor> BatchStrideD1s_;
index_t BatchStrideE1_;
};
using DeviceGemmGemmCommonBase =
DeviceGemmGemm_Wmma_CShuffleV3_Common<DeviceOp,
GemmSpec,
ALayout,
B0layout,
D0sLayout,
B1Layout,
D1sLayout,
E1Layout,
BlockSize,
MPerBlock,
LPerBlock,
KPerBlock,
NPerBlock,
ADataType,
B0DataType,
B1DataType,
AccDataType,
E1DataType,
D0sDataType,
D1sDataType,
AElementwiseOperation,
B0ElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CDE1ElementwiseOperation,
AK1,
BK1,
L1,
MPerWmma,
LPerWmma,
BlkGemmPipelineVer,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
B0BlockTransferSrcVectorDim,
B0BlockTransferSrcScalarPerVector,
B1BlockTransferSrcVectorDim,
B1BlockTransferSrcScalarPerVector,
CDE0BlockTransferSrcScalarPerVector,
CShuffleBlockTransferScalarPerVector_NPerBlock,
true>; // IsMultiD
// GridwiseOp
using GridwiseOp = GridwiseBatchedGemmGemm_wmma_cshuffle_v3<
@@ -350,12 +179,12 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Wmma_CShuffleV3
CDE1ElementwiseOperation,
InMemoryDataOperationEnum::Set,
// InMemory Data Descriptor
AGridDesc,
B0GridDesc,
D0sGridDesc,
B1GridDesc,
D1sGridDesc,
E1GridDesc,
typename DeviceGemmGemmCommonBase::AGridDesc,
typename DeviceGemmGemmCommonBase::B0GridDesc,
typename DeviceGemmGemmCommonBase::D0sGridDesc,
typename DeviceGemmGemmCommonBase::B1GridDesc,
typename DeviceGemmGemmCommonBase::D1sGridDesc,
typename DeviceGemmGemmCommonBase::E1GridDesc,
// Tiling Family
MPerBlock,
LPerBlock,
@@ -402,430 +231,67 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Wmma_CShuffleV3
CShuffleNRepeatPerShuffle,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
CShuffleBlockTransferScalarPerVector_NPerBlock,
Transform::matrix_padder.PadN,
DeviceGemmGemmCommonBase::GridDescriptorCreator::Transform::matrix_padder.PadN,
BlkGemmPipeSched,
BlkGemmPipelineVer>;
struct RawArg : public BaseArgument
using DeviceGemmGemmCommon = DeviceGemmGemm_Wmma_CShuffleV3_Common_Invoker_Arg<
DeviceOp,
GemmSpec,
ALayout,
B0layout,
D0sLayout,
B1Layout,
D1sLayout,
E1Layout,
BlockSize,
MPerBlock,
LPerBlock,
KPerBlock,
NPerBlock,
ADataType,
B0DataType,
B1DataType,
AccDataType,
E1DataType,
D0sDataType,
D1sDataType,
AElementwiseOperation,
B0ElementwiseOperation,
AccElementwiseOperation,
B1ElementwiseOperation,
CDE1ElementwiseOperation,
AK1,
BK1,
L1,
MPerWmma,
LPerWmma,
BlkGemmPipelineVer,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
B0BlockTransferSrcVectorDim,
B0BlockTransferSrcScalarPerVector,
B1BlockTransferSrcVectorDim,
B1BlockTransferSrcScalarPerVector,
CDE0BlockTransferSrcScalarPerVector,
CShuffleBlockTransferScalarPerVector_NPerBlock,
true>; // IsMultiD
// Invoker
using Invoker = typename DeviceGemmGemmCommon::Invoker;
// Argument
using Argument = typename DeviceGemmGemmCommon::Argument;
static bool IsSupportedArgument(const Argument& arg)
{
using arr3 = std::array<ck::index_t, 3>;
RawArg(const ADataType* p_a_grid_,
const B0DataType* p_b0_grid_,
std::array<const void*, NumD0Tensor> p_d0s_grid_,
const B1DataType* p_b1_grid_,
std::array<const void*, NumD1Tensor> p_d1s_grid_,
E1DataType* p_e1_grid_,
index_t M_,
index_t N_,
index_t K_,
index_t O_,
index_t Batch,
index_t StrideA,
index_t StrideB0,
std::array<index_t, NumD0Tensor> StrideD0s,
index_t StrideB1,
std::array<index_t, NumD1Tensor> StrideD1s,
index_t StrideE1,
index_t BatchStrideA,
index_t BatchStrideB0,
std::array<index_t, NumD0Tensor> BatchStrideD0s,
index_t BatchStrideB1,
std::array<index_t, NumD1Tensor> BatchStrideD1s,
index_t BatchStrideE1,
AElementwiseOperation a_element_op_,
B0ElementwiseOperation b0_element_op_,
AccElementwiseOperation acc_element_op_,
B1ElementwiseOperation b1_element_op_,
CDE1ElementwiseOperation cde1_element_op_)
: p_a_grid{p_a_grid_},
p_b0_grid{p_b0_grid_},
p_d0s_grid{},
p_b1_grid{p_b1_grid_},
p_d1s_grid{},
p_e1_grid{p_e1_grid_},
M{M_},
N{N_},
K{K_},
O{O_},
batch_count{Batch},
a_element_op{a_element_op_},
b0_element_op{b0_element_op_},
acc_element_op{acc_element_op_},
b1_element_op{b1_element_op_},
cde1_element_op{cde1_element_op_},
compute_base_ptr_of_batch{BatchStrideA,
BatchStrideB0,
BatchStrideD0s,
BatchStrideB1,
BatchStrideD1s,
BatchStrideE1}
{
a_g_m_k_lengths = arr3{batch_count, M, K};
a_g_m_k_strides = arr3{BatchStrideA, StrideA, 1}; // A layout [batch_count, M, K]
b0_g_n_k_lengths = arr3{batch_count, N, K};
b0_g_n_k_strides = arr3{BatchStrideB0, StrideB0, 1}; // B0 layout [batch_count, N, K]
b1_g_o_n_lengths = arr3{batch_count, O, N};
b1_g_o_n_strides =
is_same_v<B1Layout, tensor_layout::gemm::RowMajor>
? arr3{BatchStrideB1, 1, StrideB1} // B1 layout [batch_count, N, O]
: arr3{BatchStrideB1, StrideB1, 1}; // B1 layout [batch_count, O, N]
e1_g_m_o_lengths = arr3{batch_count, M, O};
e1_g_m_o_strides = arr3{BatchStrideE1, StrideE1, 1}; // C layout [batch_count, M, O]
a_grid_desc = MakeAGridDescriptor(a_g_m_k_lengths, a_g_m_k_strides);
b0_grid_desc = MakeB0GridDescriptor(b0_g_n_k_lengths, b0_g_n_k_strides);
b1_grid_desc = MakeB1GridDescriptor(b1_g_o_n_lengths, b1_g_o_n_strides);
e1_grid_desc_m_n = MakeE1GridDescriptor(e1_g_m_o_lengths, e1_g_m_o_strides);
e1_grid_desc_mblock_mperblock_nblock_nperblock =
GridwiseOp::MakeE1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
e1_grid_desc_m_n);
block_2_etile_map = GridwiseOp::MakeDefaultBlock2ETileMap(e1_grid_desc_m_n, 1, 1);
static_for<0, NumD0Tensor, 1>{}([&](auto i) {
using D0DataType = remove_cvref_t<tuple_element_t<i.value, D0sDataType>>;
// D0s layout [batch_count, M, N]
d0s_g_m_n_lengths[i] = arr3{batch_count, M, N};
d0s_g_m_n_strides[i] = arr3{BatchStrideD0s[i], StrideD0s[i], 1};
// D0 pointer
p_d0s_grid(i) = static_cast<const D0DataType*>(p_d0s_grid_[i]);
// D0 desc
d0s_grid_desc(i) = MakeD0GridDescriptor(d0s_g_m_n_lengths[i], d0s_g_m_n_strides[i]);
});
static_for<0, NumD1Tensor, 1>{}([&](auto i) {
using D1DataType = remove_cvref_t<tuple_element_t<i.value, D1sDataType>>;
// D1s layout [batch_count, M, O]
d1s_g_m_o_lengths[i] = arr3{batch_count, M, O};
d1s_g_m_o_strides[i] = arr3{BatchStrideD1s[i], StrideD1s[i], 1};
// D1 pointer
p_d1s_grid(i) = static_cast<const D1DataType*>(p_d1s_grid_[i]);
// D1 desc
d1s_grid_desc(i) = MakeE1GridDescriptor(d1s_g_m_o_lengths[i], d1s_g_m_o_strides[i]);
});
d1s_grid_desc_mblock_mperblock_nblock_nperblock =
GridwiseOp::MakeD1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(d1s_grid_desc);
}
// Pointers
const ADataType* p_a_grid;
const B0DataType* p_b0_grid;
typename GridwiseOp::D0sGridPointer p_d0s_grid;
const B1DataType* p_b1_grid;
typename GridwiseOp::D1sGridPointer p_d1s_grid;
E1DataType* p_e1_grid;
// Raw Problem Size
index_t M;
index_t N;
index_t K;
index_t O;
index_t batch_count;
arr3 a_g_m_k_lengths;
arr3 a_g_m_k_strides;
arr3 b0_g_n_k_lengths;
arr3 b0_g_n_k_strides;
std::array<arr3, NumD0Tensor> d0s_g_m_n_lengths;
std::array<arr3, NumD0Tensor> d0s_g_m_n_strides;
arr3 b1_g_o_n_lengths;
arr3 b1_g_o_n_strides;
std::array<arr3, NumD1Tensor> d1s_g_m_o_lengths;
std::array<arr3, NumD1Tensor> d1s_g_m_o_strides;
arr3 e1_g_m_o_lengths;
arr3 e1_g_m_o_strides;
AElementwiseOperation a_element_op;
B0ElementwiseOperation b0_element_op;
AccElementwiseOperation acc_element_op;
B1ElementwiseOperation b1_element_op;
CDE1ElementwiseOperation cde1_element_op;
// Grid descriptors and other mem calculators
AGridDesc a_grid_desc;
B0GridDesc b0_grid_desc;
D0sGridDesc d0s_grid_desc;
B1GridDesc b1_grid_desc;
D1sGridDesc d1s_grid_desc;
typename GridwiseOp::D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
d1s_grid_desc_mblock_mperblock_nblock_nperblock;
E1GridDesc e1_grid_desc_m_n;
typename GridwiseOp::E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
e1_grid_desc_mblock_mperblock_nblock_nperblock;
typename GridwiseOp::DefaultBlock2ETileMap block_2_etile_map;
ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch;
};
// check if DsLayout is supported
template <typename RefLayout, typename DsLayout, const index_t NumDTensor>
static constexpr bool CheckDLayout()
{
bool valid = true;
// iterate over DLayout tuple
static_for<0, NumDTensor, 1>{}([&](auto i) {
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
// if RefLayout and DLayout are same, keep valid true, otherwise false
valid = valid && is_same_v<RefLayout, DLayout>;
});
return valid;
return DeviceGemmGemmCommon::IsSupportedArgument(arg);
}
static bool IsSupportedArgument([[maybe_unused]] const RawArg& arg)
{
// Print lambda with env check and printf() style formmating.
const char* curFunc = __func__;
auto print = [&curFunc](const char* format, ...) -> void {
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wformat-nonliteral"
#endif
va_list args;
va_start(args, format);
std::vfprintf(stdout, format, args);
va_end(args);
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
std::cout << "In file: " << __FILE__ << ", function: " << curFunc << "\n";
}
};
if(!(ck::is_gfx11_supported() || ck::is_gfx12_supported()))
{
print("DeviceOp: Arch err\n");
return false;
}
if constexpr(std::is_same_v<ADataType, f8_t> || std::is_same_v<ADataType, bf8_t> ||
std::is_same_v<B0DataType, f8_t> || std::is_same_v<B0DataType, bf8_t> ||
std::is_same_v<B1DataType, f8_t> || std::is_same_v<B1DataType, bf8_t>)
{
if(ck::is_gfx11_supported())
{
print("DeviceOp: gfx 11 does not support fp8\n");
return false;
}
}
if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
{
print("DeviceOp: Acc0 Type err\n");
return false;
}
if constexpr(!(is_same_v<ALayout, tensor_layout::gemm::RowMajor>))
{
print("DeviceOp: A layout must be Row\n");
return false;
}
if constexpr(!(is_same_v<B0layout, tensor_layout::gemm::ColumnMajor>))
{
print("DeviceOp: B0 layout must be Column\n");
return false;
}
if constexpr(!(CheckDLayout<tensor_layout::gemm::RowMajor, D0sLayout, NumD0Tensor>()))
{
print("DeviceOp: All D0s layout must be Row\n");
return false;
}
if constexpr(!(is_same_v<B1Layout, tensor_layout::gemm::RowMajor> ||
is_same_v<B1Layout, tensor_layout::gemm::ColumnMajor>))
{
print("DeviceOp: B1 layout must be Column or Row\n");
return false;
}
if constexpr(!(CheckDLayout<tensor_layout::gemm::RowMajor, D1sLayout, NumD1Tensor>()))
{
print("DeviceOp: All D1s layout must be Row\n");
return false;
}
if constexpr(!(is_same_v<E1Layout, tensor_layout::gemm::RowMajor>))
{
print("DeviceOp: C layout must be Row\n");
return false;
}
// Other padding modes have not been tested and do not get checked individually.
if constexpr(GemmSpec != GemmSpecialization::Default &&
GemmSpec != GemmSpecialization::MNKOPadding)
{
print("Padding mode must be default or MNKO\n");
return false;
}
// Per wmma dimensions not equal to 16 are very untested.
if constexpr(MPerWmma != 16 || LPerWmma != 16 || NPerWmma != 16)
{
print("M, L, N per Wmma must be 16\n");
return false;
}
if(!GridwiseOp::CheckValidity(arg.a_grid_desc,
arg.b0_grid_desc,
arg.d0s_grid_desc,
arg.b1_grid_desc,
arg.d1s_grid_desc,
arg.e1_grid_desc_m_n,
arg.block_2_etile_map))
{
return false;
}
// Check scalar per vector requirement
const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? arg.K : arg.M;
const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? arg.K : arg.N;
const auto cde0_extent_lowest = arg.N; // D0 tensors forced to be row-major
const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? arg.N : arg.O;
const auto cde1_extent_lowest = arg.O;
if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 &&
b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 &&
cde0_extent_lowest % CDE0BlockTransferSrcScalarPerVector == 0 &&
b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 &&
cde1_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0))
{
print("DeviceOp: Data Transfer Vector scalar err\n");
return false;
}
// Check vector load/store requirement
const auto a_stride_lowest =
ABlockTransferSrcVectorDim == 2 ? arg.a_g_m_k_strides[2] : arg.a_g_m_k_strides[1];
const auto b0_stride_lowest =
B0BlockTransferSrcVectorDim == 2 ? arg.b0_g_n_k_strides[2] : arg.b0_g_n_k_strides[1];
const auto b1_stride_lowest =
B1BlockTransferSrcVectorDim == 2 ? arg.b1_g_o_n_strides[2] : arg.b1_g_o_n_strides[1];
const auto e1_stride_lowest = arg.e1_g_m_o_strides[2];
// NOTE: We don't check D0s/D1s stride, as they are already forced to be row-major
// and the lowest dimension stride is hardcoded to 1
if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 ||
e1_stride_lowest == 1))
{
print("DeviceOp: Data Vectorize transfer err\n");
return false;
}
if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MNKOPadding))
{
return false;
}
return true;
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const RawArg*>(p_arg));
return DeviceGemmGemmCommon::IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
struct Invoker : public BaseInvoker
{
using Argument = DeviceOp::RawArg;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto M0 = math::integer_divide_ceil(arg.M, MPerBlock);
const auto N0 = math::integer_divide_ceil(arg.O, NPerBlock);
const index_t grid_size = arg.batch_count * M0 * N0;
auto launch_kernel = [&](auto has_main_k_block_loop, auto tail_number) {
constexpr bool has_loop = decltype(has_main_k_block_loop)::value;
constexpr TailNumber tn = tail_number;
const auto kernel =
kernel_batched_gemm_multiple_d_gemm_multiple_d_wmma_cshuffle_v3<DeviceOp,
GridwiseOp,
has_loop,
tn>;
return launch_and_time_kernel(
stream_config, kernel, dim3(grid_size), dim3(BlockSize), 0, arg);
};
bool HasMainKBlockLoop = GridwiseOp::CalculateHasMainKBlockLoop(arg.K);
TailNumber TailNum = GridwiseOp::CalculateKBlockLoopTailNum(arg.K);
if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
{
if(HasMainKBlockLoop && TailNum == TailNumber::Full)
{
return launch_kernel(std::integral_constant<bool, true>{},
std::integral_constant<TailNumber, TailNumber::Full>{});
}
else if(!HasMainKBlockLoop && TailNum == TailNumber::Full)
{
return launch_kernel(std::integral_constant<bool, false>{},
std::integral_constant<TailNumber, TailNumber::Full>{});
}
else
{
printf("Invalid HasMainKBlockLoop and TailNum combination for V1!\n");
return 0.0f;
}
}
else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
{
if(HasMainKBlockLoop && TailNum == TailNumber::Full)
{
return launch_kernel(std::integral_constant<bool, true>{},
std::integral_constant<TailNumber, TailNumber::Full>{});
}
else if(!HasMainKBlockLoop && TailNum == TailNumber::Even)
{
return launch_kernel(std::integral_constant<bool, false>{},
std::integral_constant<TailNumber, TailNumber::Even>{});
}
else if(!HasMainKBlockLoop && TailNum == TailNumber::Odd)
{
return launch_kernel(std::integral_constant<bool, false>{},
std::integral_constant<TailNumber, TailNumber::Odd>{});
}
else
{
printf("Invalid HasMainKBlockLoop and TailNum combination for V3!\n");
return 0.0f;
}
}
else
{
printf("Invalid pipeline version!\n");
return 0.0f;
}
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static auto MakeArgument(const ADataType* p_a0,
const B0DataType* p_b0,
std::array<const void*, NumD0Tensor> p_d0s,
@@ -855,20 +321,20 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Wmma_CShuffleV3
B1ElementwiseOperation b1_element_op,
CDE1ElementwiseOperation cde1_element_op)
{
return RawArg{p_a0, p_b0,
p_d0s, p_b1,
p_d1s, p_e1,
MRaw, NRaw,
KRaw, Gemm1NRaw,
Batch, StrideA0,
StrideB0, StrideD0s,
StrideB1, StrideD1s,
StrideE1, BatchStrideA0,
BatchStrideB0, BatchStrideD0s,
BatchStrideB1, BatchStrideD1s,
BatchStrideE1, a0_element_op,
b0_element_op, cde0_element_op,
b1_element_op, cde1_element_op};
return Argument{p_a0, p_b0,
p_d0s, p_b1,
p_d1s, p_e1,
MRaw, NRaw,
KRaw, Gemm1NRaw,
Batch, StrideA0,
StrideB0, StrideD0s,
StrideB1, StrideD1s,
StrideE1, BatchStrideA0,
BatchStrideB0, BatchStrideD0s,
BatchStrideB1, BatchStrideD1s,
BatchStrideE1, a0_element_op,
b0_element_op, cde0_element_op,
b1_element_op, cde1_element_op};
}
// polymorphic
@@ -902,34 +368,34 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Wmma_CShuffleV3
B1ElementwiseOperation b1_element_op,
CDE1ElementwiseOperation c_element_op) override
{
return std::make_unique<RawArg>(static_cast<const ADataType*>(p_a),
static_cast<const B0DataType*>(p_b0),
p_d0s,
static_cast<const B1DataType*>(p_b1),
p_d1s,
static_cast<E1DataType*>(p_c),
M,
N,
K,
O,
Batch,
StrideA,
StrideB0,
StrideD0s,
StrideB1,
StrideD1s,
StrideE1,
BatchStrideA,
BatchStrideB0,
BatchStrideD0s,
BatchStrideB1,
BatchStrideD1s,
BatchStrideE1,
a_element_op,
b0_element_op,
acc_element_op,
b1_element_op,
c_element_op);
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const B0DataType*>(p_b0),
p_d0s,
static_cast<const B1DataType*>(p_b1),
p_d1s,
static_cast<E1DataType*>(p_c),
M,
N,
K,
O,
Batch,
StrideA,
StrideB0,
StrideD0s,
StrideB1,
StrideD1s,
StrideE1,
BatchStrideA,
BatchStrideB0,
BatchStrideD0s,
BatchStrideB1,
BatchStrideD1s,
BatchStrideE1,
a_element_op,
b0_element_op,
acc_element_op,
b1_element_op,
c_element_op);
}
static auto MakeInvoker() { return Invoker{}; }

View File

@@ -42,7 +42,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_gemm_xdl_cshuffle_v1(
const A0B0B1DataType* __restrict__ p_a0_grid,

View File

@@ -606,6 +606,26 @@ struct DeviceBatchedGemmMultiD_Wmma_CShuffleV3
return false;
}
if(ck::is_gfx12_supported() && !GridwiseGemm::CheckValidityAWaveTransfer(arg.M, arg.K))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix A" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
if(ck::is_gfx12_supported() && !GridwiseGemm::CheckValidityBWaveTransfer(arg.N, arg.K))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix B" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
return GridwiseGemm::CheckValidity(arg);
}

View File

@@ -33,14 +33,14 @@ template <typename GridwiseGemm,
TailNumber TailNum = TailNumber::Full>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
__launch_bounds__(GridwiseGemm::MaxBlockSize, MinimumOccupancy)
#endif
kernel_batched_gemm_xdl_cshuffle_v3_multi_d(BatchedGemmArg karg)
{
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t g_idx = blockIdx.z % karg.Batch;
const index_t k_idx = blockIdx.z / karg.Batch;
@@ -82,7 +82,7 @@ template <typename GridwiseGemm,
TailNumber TailNum = TailNumber::Full>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
__launch_bounds__(GridwiseGemm::MaxBlockSize, MinimumOccupancy)
#endif
kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds(BatchedGemmArg karg)
{
@@ -91,8 +91,8 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
{
// Pass two lds pointer is the key to tell compiler that ds_read/write
// operate on different lds chunk at same time without order dependecy
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t g_idx = blockIdx.z % karg.Batch;
const index_t k_idx = blockIdx.z / karg.Batch;

View File

@@ -588,6 +588,28 @@ struct DeviceBatchedGemmReduce_Wmma_CShuffleV3
return false;
}
if(ck::is_gfx12_supported() &&
!GridwiseGemm::CheckValidityAWaveTransfer(arg.MRaw_, arg.KRaw_))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix A" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
if(ck::is_gfx12_supported() &&
!GridwiseGemm::CheckValidityBWaveTransfer(arg.NRaw_, arg.KRaw_))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix B" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
typename GridwiseGemm::Argument gemm_arg{std::array<const void*, 1>{arg.p_a_grid_},
std::array<const void*, 1>{arg.p_b_grid_},
std::array<const void*, 0>{},

View File

@@ -39,7 +39,7 @@ template <typename GridwiseGemm,
bool HasMainK0BlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_reduce_xdl_cshuffle_v1(
const FloatAB* __restrict__ p_a_grid,
@@ -81,7 +81,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
p_reduces_grid(In) = p_reduces_grid(In) + d_batch_offset;
});
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
GridwiseGemm::template Run<HasMainK0BlockLoop>(
p_a_grid + a_batch_offset,

View File

@@ -43,7 +43,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
const FloatAB* __restrict__ p_a_grid,

View File

@@ -44,7 +44,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1(
const FloatAB* __restrict__ p_a_grid,

View File

@@ -455,6 +455,26 @@ struct DeviceBatchedGemm_Wmma_CShuffleV3_Common
return false;
}
if(ck::is_gfx12_supported() && !GridwiseGemm::CheckValidityAWaveTransfer(arg.M, arg.K))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix A" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
if(ck::is_gfx12_supported() && !GridwiseGemm::CheckValidityBWaveTransfer(arg.N, arg.K))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix B" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
return GridwiseGemm::CheckValidity(arg);
}

View File

@@ -48,7 +48,7 @@ namespace device {
template <typename DeviceOp, typename GridwiseGemm, bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_xdlops_v2r3(const typename DeviceOp::Argument karg)
{
@@ -67,7 +67,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const auto a_grid_desc_k0_m_k1 =
amd_wave_read_first_lane(GridwiseGemm::MakeAGridDescriptor_K0_M_K1(

View File

@@ -33,14 +33,14 @@ template <typename GridwiseGemm,
TailNumber TailNum = TailNumber::Full>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
__launch_bounds__(GridwiseGemm::MaxBlockSize, MinimumOccupancy)
#endif
kernel_batched_gemm_b_scale_xdl_cshuffle_v3(BatchedGemmArg karg)
{
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t g_idx = blockIdx.z % karg.Batch;
const index_t k_idx = blockIdx.z / karg.Batch;
@@ -83,8 +83,8 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
{
// Pass two lds pointer is the key to tell compiler that ds_read/write
// operate on different lds chunk at same time without order dependecy
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
__shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
const index_t g_idx = blockIdx.z % karg.Batch;
const index_t k_idx = blockIdx.z / karg.Batch;

View File

@@ -37,7 +37,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_contraction_multiple_abd_xdl_cshuffle(
AsPointer p_as_grid,
@@ -58,7 +58,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
GridwiseGemm::template Run<HasMainKBlockLoop>(
p_as_grid,

View File

@@ -35,7 +35,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_contraction_multiple_d_xdl_cshuffle(
const FloatAB* __restrict__ p_a_grid,
@@ -56,7 +56,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
p_a_grid,

View File

@@ -471,6 +471,28 @@ struct DeviceGemmBiasAddReduce_Wmma_CShuffleV3
return false;
}
if(ck::is_gfx12_supported() &&
!GridwiseGemm::CheckValidityAWaveTransfer(arg.MRaw_, arg.KRaw_))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix A" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
if(ck::is_gfx12_supported() &&
!GridwiseGemm::CheckValidityBWaveTransfer(arg.NRaw_, arg.KRaw_))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix B" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
typename GridwiseGemm::Argument gemm_arg{
std::array<const void*, 1>{arg.p_a_grid_},
std::array<const void*, 1>{arg.p_b_grid_},

View File

@@ -701,6 +701,28 @@ struct DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3
return false;
}
if(ck::is_gfx12_supported() &&
!GridwiseGemmWelford::CheckValidityAWaveTransfer(arg.MRaw_, arg.KRaw_))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix A" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
if(ck::is_gfx12_supported() &&
!GridwiseGemmWelford::CheckValidityBWaveTransfer(arg.NRaw_, arg.KRaw_))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix B" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
typename GridwiseGemmWelford::Argument gemm_arg{
std::array<const void*, 1>{arg.p_a_grid_},
std::array<const void*, 1>{arg.p_b_grid_},

View File

@@ -38,7 +38,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_multiple_d_multiple_r_xdl_cshuffle(
const FloatAB* __restrict__ p_a_grid,
@@ -63,7 +63,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
GridwiseGemm::template Run<HasMainKBlockLoop>(
p_a_grid,

View File

@@ -37,7 +37,7 @@ template <typename GridwiseGemm,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
__launch_bounds__(GridwiseGemm::MaxBlockSize, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_multiple_d_xdl_cshuffle(const ADataType* __restrict__ p_a_grid,
const BDataType* __restrict__ p_b_grid,
@@ -57,7 +57,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#if defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__)
if constexpr(GridwiseGemm::template IsValidCompilationParameter<>())
{
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
p_a_grid,
@@ -899,7 +899,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
assert(desc.IsValid());
#endif
using GridwiseGemm = conditional_t<get_warp_size() == 64, GridwiseGemm64, GridwiseGemm32>;
__shared__ char p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte()];
__shared__ char
p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte(get_device_arch())];
if(desc.has_main_k_block_loop)
{
GridwiseGemm::template Run<true, InMemoryDataOperationEnum::Set>(

View File

@@ -456,6 +456,28 @@ struct DeviceGemmReduce_Wmma_CShuffleV3 : public DeviceGemmReduce<0, ReduceOpera
return false;
}
if(ck::is_gfx12_supported() &&
!GridwiseGemm::CheckValidityAWaveTransfer(arg.MRaw_, arg.KRaw_))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix A" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
if(ck::is_gfx12_supported() &&
!GridwiseGemm::CheckValidityBWaveTransfer(arg.NRaw_, arg.KRaw_))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix B" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
typename GridwiseGemm::Argument gemm_arg{std::array<const void*, 1>{arg.p_a_grid_},
std::array<const void*, 1>{arg.p_b_grid_},
std::array<const void*, 0>{},

View File

@@ -421,6 +421,26 @@ struct DeviceGemm_Wmma_CShuffleV3_Common
}
}
if(ck::is_gfx12_supported() && !GridwiseGemm::CheckValidityAWaveTransfer(arg.M, arg.K))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix A" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
if(ck::is_gfx12_supported() && !GridwiseGemm::CheckValidityBWaveTransfer(arg.N, arg.K))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix B" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
return GridwiseGemm::CheckValidity(arg);
}
};

View File

@@ -393,6 +393,26 @@ struct DeviceGemm_Wmma_CShuffleV3R1 : public DeviceGemmV2R1<ALayout,
return false;
}
if(ck::is_gfx12_supported() && !GridwiseGemm::CheckValidityAWaveTransfer(arg.M, arg.K))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix A" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
if(ck::is_gfx12_supported() && !GridwiseGemm::CheckValidityBWaveTransfer(arg.N, arg.K))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Wave Transfer not applicable for matrix B" << __FILE__ << ":"
<< __LINE__ << ", in function: " << __func__ << std::endl;
}
return false;
}
return GridwiseGemm::CheckValidity(
*dynamic_cast<const typename GridwiseGemm::Argument*>(&arg));
}

View File

@@ -403,14 +403,29 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX<ALayout,
KBatch_cond_choice.value == (arg.KBatch > 1) &&
tail_num_choice.value == tail_num)
{
const auto kernel = kernel_gemm_xdl_cshuffle_v3_mx< //
Use2LDS,
GridwiseGemm,
mainloop_choice.value,
CGlobalMemoryDataOperation,
minimum_occupancy,
tail_num_choice.value>;
Run(kernel);
if constexpr(is_same_v<BLayout, tensor_layout::gemm::MFMA>)
{
const auto kernel = kernel_gemm_xdl_cshuffle_v3_mx_bpreshuffle< //
Use2LDS,
GridwiseGemm,
mainloop_choice.value,
CGlobalMemoryDataOperation,
minimum_occupancy,
tail_num_choice.value>;
Run(kernel);
return;
}
else
{
const auto kernel = kernel_gemm_xdl_cshuffle_v3_mx< //
Use2LDS,
GridwiseGemm,
mainloop_choice.value,
CGlobalMemoryDataOperation,
minimum_occupancy,
tail_num_choice.value>;
Run(kernel);
}
}
});
return ave_time;

Some files were not shown because too many files have changed in this diff Show More